diff --git a/experimental/builder/test/test_ck_factory_grouped_convolution_forward.cpp b/experimental/builder/test/test_ck_factory_grouped_convolution_forward.cpp index 73a731a69c..ea0e18f07a 100644 --- a/experimental/builder/test/test_ck_factory_grouped_convolution_forward.cpp +++ b/experimental/builder/test/test_ck_factory_grouped_convolution_forward.cpp @@ -60,7 +60,57 @@ struct F32_1D_GNWC_GKXC_GNWK constexpr static auto expected = { // clang-format off - "" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,192,16,4,4,32,32,2,3,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,192,16,4,4,32,32,2,3,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,192,16,4,4,32,32,2,3,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>" // clang-format on }; }; @@ -71,7 +121,54 @@ struct F16_1D_GNWC_GKXC_GNWK constexpr static auto expected = { // clang-format off - "" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>" // clang-format on }; }; @@ -82,7 +179,54 @@ struct BF16_1D_GNWC_GKXC_GNWK constexpr static auto expected = { // clang-format off - "" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>" // clang-format on }; }; @@ -93,7 +237,54 @@ struct S8_1D_GNWC_GKXC_GNWK constexpr static auto expected = { // clang-format off - "" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<1,GNWC,GKXC,EmptyTuple,GNWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,s8,s8,Default,1>" // clang-format on }; }; @@ -106,7 +297,74 @@ struct F32_2D_GNHWC_GKYXC_GNHWK constexpr static auto expected = { // clang-format off - "" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,192,16,4,4,32,32,2,3,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,192,16,4,4,32,32,2,3,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,192,16,4,4,32,32,2,3,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,256,128,192,16,4,4,32,32,2,3,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>" // clang-format on }; }; @@ -117,7 +375,70 @@ struct F16_2D_GNHWC_GKYXC_GNHWK constexpr static auto expected = { // clang-format off - "" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>" // clang-format on }; }; @@ -128,7 +449,70 @@ struct BF16_2D_GNHWC_GKYXC_GNHWK constexpr static auto expected = { // clang-format off - "" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,GNHWC,GKYXC,EmptyTuple,GNHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>" // clang-format on }; }; @@ -151,151 +535,235 @@ struct F32_2D_NHWGC_GKYXC_NHWGK constexpr static auto expected = { // clang-format off - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", - "" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,192,16,4,4,32,32,2,3,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp32,fp32,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp32,fp32,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp32,fp32,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,192,16,4,4,32,32,2,3,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,192,16,4,4,32,32,2,3,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp32,fp32,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp32,fp32,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp32,fp32,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,256,128,192,16,4,4,32,32,2,3,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default>" // clang-format on }; }; @@ -306,219 +774,303 @@ struct F16_2D_NHWGC_GKYXC_NHWGK constexpr static auto expected = { // clang-format off - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", - "" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,8),2,fp16,fp16,Default>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default>" // clang-format on }; }; @@ -529,231 +1081,315 @@ struct BF16_2D_NHWGC_GKYXC_NHWGK constexpr static auto expected = { // clang-format off - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", - "" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,8),2,bf16,bf16,Default>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default>" // clang-format on }; }; @@ -764,203 +1400,274 @@ struct S8_2D_NHWGC_GKYXC_NHWGK constexpr static auto expected = { // clang-format off - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,s8,s8>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,s8,s8>", - "" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,s8,s8,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,s8,s8,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,s8,s8,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter3x3,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,s8,s8,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter3x3,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,s8,s8,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter3x3,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,s8,s8,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,s8,s8,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,OddC,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,s8,s8,false>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,s8,s8,Default>" // clang-format on }; }; @@ -973,43 +1680,108 @@ struct F32_2D_NGCHW_GKCYX_NGKHW constexpr static auto expected = { // clang-format off - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", - "" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,192,16,4,4,32,32,2,3,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp32,fp32,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp32,fp32,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp32,fp32,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,192,16,4,4,32,32,2,3,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,192,16,4,4,32,32,2,3,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp32,fp32,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp32,fp32,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp32,fp32,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32,false>" // clang-format on }; }; @@ -1020,48 +1792,125 @@ struct F16_2D_NGCHW_GKCYX_NGKHW constexpr static auto expected = { // clang-format off - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", - "" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(0,2,1),Seq(0,2,1),1,4,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,16,1,8),4,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(0,2,1),Seq(0,2,1),1,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),4,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(0,2,1),Seq(0,2,1),1,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),4,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,32,8,8,16,16,4,2,Seq(4,64,1),Seq(0,2,1),Seq(0,2,1),1,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,32,8,8,16,16,4,2,Seq(4,64,1),Seq(0,2,1),Seq(0,2,1),1,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),4,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(0,2,1),Seq(0,2,1),1,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),4,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(0,2,1),Seq(0,2,1),1,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),4,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(0,2,1),Seq(0,2,1),1,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(0,2,1),Seq(0,2,1),1,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),4,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,8,1,8),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,8,1,8),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,8,1,8),4,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16,false>" // clang-format on }; }; @@ -1072,58 +1921,123 @@ struct BF16_2DNGCHW_GKCYX_NGKHWK constexpr static auto expected = { // clang-format off - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", - "" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NGCHW,GKCYX,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16,false>" // clang-format on }; }; @@ -1136,7 +2050,7 @@ struct F32_2D_NGCHW_GKYXC_NGKHW constexpr static auto expected = { // clang-format off - "" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKYXC,EmptyTuple,NGKHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>" // clang-format on }; }; @@ -1147,7 +2061,19 @@ struct F16_2D_NGCHW_GKYXC_NGKHW constexpr static auto expected = { // clang-format off - "" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKYXC,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKYXC,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(0,2,1),Seq(0,2,1),1,4,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,16,1,8),4,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKYXC,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(0,2,1),Seq(0,2,1),1,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),4,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKYXC,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(0,2,1),Seq(0,2,1),1,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),4,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKYXC,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,32,8,8,16,16,4,2,Seq(4,64,1),Seq(0,2,1),Seq(0,2,1),1,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKYXC,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,32,8,8,16,16,4,2,Seq(4,64,1),Seq(0,2,1),Seq(0,2,1),1,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),4,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKYXC,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(0,2,1),Seq(0,2,1),1,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),4,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKYXC,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(0,2,1),Seq(0,2,1),1,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),4,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKYXC,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(0,2,1),Seq(0,2,1),1,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKYXC,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(0,2,1),Seq(0,2,1),1,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),4,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKYXC,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,8,1,8),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKYXC,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,8,1,8),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKYXC,EmptyTuple,NGKHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,8,1,8),4,fp16,fp16,Default,1>" // clang-format on }; }; @@ -1158,7 +2084,7 @@ struct BF16_2D_NGCHW_GKYXC_NGKHW constexpr static auto expected = { // clang-format off - "" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKYXC,EmptyTuple,NGKHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>" // clang-format on }; }; @@ -1169,7 +2095,7 @@ struct S8_2D_NGCHW_GKYXC_NGKHW constexpr static auto expected = { // clang-format off - "" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NGCHW,GKYXC,EmptyTuple,NGKHW,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,s8,s8,Default,1>" // clang-format on }; }; @@ -1182,7 +2108,57 @@ struct F32_3D_GNDHWC_GKZYXC_GNDHWK constexpr static auto expected = { // clang-format off - "" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,192,16,4,4,32,32,2,3,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,192,16,4,4,32,32,2,3,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,192,16,4,4,32,32,2,3,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>" // clang-format on }; }; @@ -1193,7 +2169,54 @@ struct F16_3D_GNDHWC_GKZYXC_GNDHWK constexpr static auto expected = { // clang-format off - "" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>" // clang-format on }; }; @@ -1204,7 +2227,54 @@ struct BF16_3D_GNDHWC_GKZYXC_GNDHWK constexpr static auto expected = { // clang-format off - "" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>" // clang-format on }; }; @@ -1215,7 +2285,54 @@ struct S8_3D_GNDHWC_GKZYXC_GNDHWK constexpr static auto expected = { // clang-format off - "" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,GNDHWC,GKZYXC,EmptyTuple,GNDHWK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,s8,s8,Default,1>" // clang-format on }; }; @@ -1228,115 +2345,182 @@ struct F32_3D_NDHWGC_GKZYXC_NDHWGK_TF32 constexpr static auto expected = { // clang-format off - "", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,192,16,4,4,32,32,2,3,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,tf32,tf32,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,tf32,tf32,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,tf32,tf32,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,192,16,4,4,32,32,2,3,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,192,16,4,4,32,32,2,3,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,tf32,tf32,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,tf32,tf32,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,tf32,tf32,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,tf32,tf32,Default>" // clang-format on }; }; @@ -1347,115 +2531,182 @@ struct F32_3D_NDHWGC_GKZYXC_NDHWGK constexpr static auto expected = { // clang-format off - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", - "" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,192,16,4,4,32,32,2,3,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp32,fp32,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp32,fp32,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp32,fp32,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,192,16,4,4,32,32,2,3,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,192,16,4,4,32,32,2,3,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp32,fp32,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp32,fp32,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp32,fp32,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default>" // clang-format on }; }; @@ -1467,7 +2718,54 @@ struct F16_3D_NDHWGC_GKZYXC_NDHWGK_F8 constexpr static auto expected = { // clang-format off - "" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>" // clang-format on }; }; @@ -1478,7 +2776,54 @@ struct F8_3D_NDHWGC_GKZYXC_NDHWGK constexpr static auto expected = { // clang-format off - "" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>" // clang-format on }; }; @@ -1489,7 +2834,54 @@ struct BF8_3D_NDHWGC_GKZYXC_NDHWGK constexpr static auto expected = { // clang-format off - "" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf8,bf8,Default,1>" // clang-format on }; }; @@ -1501,7 +2893,54 @@ struct F8_BF8_3D_NDHWGC_GKZYXC_NDHWGK constexpr static auto expected = { // clang-format off - "" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,bf8,Default,1>" // clang-format on }; }; @@ -1513,7 +2952,54 @@ struct BF8_F8_3D_NDHWGC_GKZYXC_NDHWGK constexpr static auto expected = { // clang-format off - "" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp8,EmptyTuple,fp8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf8,fp8,Default,1>" // clang-format on }; }; @@ -1524,166 +3010,234 @@ struct F16_3D_NDHWGC_GKZYXC_NDHWGK constexpr static auto expected = { // clang-format off - "", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,8),2,fp16,fp16,Default>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default>" // clang-format on }; }; @@ -1694,175 +3248,243 @@ struct BF16_3D_NDHWGC_GKZYXC_NDHWGK constexpr static auto expected = { // clang-format off - "", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,8),2,bf16,bf16,Default>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default>" // clang-format on }; }; @@ -1873,7 +3495,54 @@ struct S8_3D_NDHWGC_GKZYXC_NDHWGK constexpr static auto expected = { // clang-format off - "" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,s8,s8,Default,1>" // clang-format on }; }; @@ -1886,115 +3555,180 @@ struct F32_3D_NGCDHW_GKCZYX_NDHWGK constexpr static auto expected = { // clang-format off - "", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,192,16,4,4,32,32,2,3,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp32,fp32,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp32,fp32,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp32,fp32,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,192,16,4,4,32,32,2,3,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,192,16,4,4,32,32,2,3,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp32,fp32,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp32,fp32,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp32,fp32,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32,false>" // clang-format on }; }; @@ -2005,130 +3739,207 @@ struct F16_3D_NGCDHW_GKCZYX_NDHWGK constexpr static auto expected = { // clang-format off - "", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(0,2,1),Seq(0,2,1),1,4,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,16,1,8),4,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(0,2,1),Seq(0,2,1),1,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),4,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(0,2,1),Seq(0,2,1),1,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),4,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,32,8,8,16,16,4,2,Seq(4,64,1),Seq(0,2,1),Seq(0,2,1),1,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,32,8,8,16,16,4,2,Seq(4,64,1),Seq(0,2,1),Seq(0,2,1),1,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),4,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(0,2,1),Seq(0,2,1),1,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),4,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(0,2,1),Seq(0,2,1),1,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),4,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(0,2,1),Seq(0,2,1),1,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(0,2,1),Seq(0,2,1),1,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),4,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,8,1,8),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,8,1,8),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,8,1,8),4,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16,false>" // clang-format on }; }; @@ -2139,160 +3950,225 @@ struct BF16_3D_NGCDHW_GKCZYX_NDHWGK constexpr static auto expected = { // clang-format off - "", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NGCDHW,GKCZYX,EmptyTuple,NGKDHW,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,PassThrough,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16,false>" // clang-format on }; }; diff --git a/experimental/builder/test/test_ck_factory_grouped_convolution_forward_bias_bnorm_clamp.cpp b/experimental/builder/test/test_ck_factory_grouped_convolution_forward_bias_bnorm_clamp.cpp index 75cb58018e..16cf02288b 100644 --- a/experimental/builder/test/test_ck_factory_grouped_convolution_forward_bias_bnorm_clamp.cpp +++ b/experimental/builder/test/test_ck_factory_grouped_convolution_forward_bias_bnorm_clamp.cpp @@ -90,115 +90,182 @@ struct F32_2D constexpr static auto expected = { // clang-format off - "", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,128,192,16,4,4,32,32,2,3,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp32,fp32,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp32,fp32,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp32,fp32,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,128,192,16,4,4,32,32,2,3,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,192,16,4,4,32,32,2,3,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp32,fp32,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp32,fp32,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp32,fp32,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default>" // clang-format on }; }; @@ -209,115 +276,182 @@ struct F32_TF32_2D constexpr static auto expected = { // clang-format off - "", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,128,192,16,4,4,32,32,2,3,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,tf32,tf32,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,tf32,tf32,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,tf32,tf32,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,128,192,16,4,4,32,32,2,3,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,192,16,4,4,32,32,2,3,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,tf32,tf32,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,tf32,tf32,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,tf32,tf32,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,tf32,tf32,Default>" // clang-format on }; }; @@ -328,166 +462,234 @@ struct F16_2D constexpr static auto expected = { // clang-format off - "", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,8),2,fp16,fp16,Default>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default>" // clang-format on }; }; @@ -498,175 +700,243 @@ struct BF16_2D constexpr static auto expected = { // clang-format off - "", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,8),2,bf16,bf16,Default>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<2,NHWGC,GKYXC,Tuple(NHWGK,NHWGK,NHWGK,NHWGK,NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default>" // clang-format on }; }; @@ -677,115 +947,182 @@ struct F32_3D constexpr static auto expected = { // clang-format off - "", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,128,192,16,4,4,32,32,2,3,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp32,fp32,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp32,fp32,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp32,fp32,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,128,192,16,4,4,32,32,2,3,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,192,16,4,4,32,32,2,3,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp32,fp32,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp32,fp32,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp32,fp32,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default>" // clang-format on }; }; @@ -796,115 +1133,182 @@ struct F32_TF32_3D constexpr static auto expected = { // clang-format off - "", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,128,192,16,4,4,32,32,2,3,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,tf32,tf32,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,tf32,tf32,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,tf32,tf32,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,128,192,16,4,4,32,32,2,3,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,192,16,4,4,32,32,2,3,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,tf32,tf32,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,tf32,tf32,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,tf32,tf32,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32,fp32,fp32,fp32),fp32,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,tf32,tf32,Default>" // clang-format on }; }; @@ -915,166 +1319,234 @@ struct F16_3D constexpr static auto expected = { // clang-format off - "", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,8),2,fp16,fp16,Default>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16,fp16,fp16,fp16),fp16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default>" // clang-format on }; }; @@ -1085,175 +1557,243 @@ struct BF16_3D constexpr static auto expected = { // clang-format off - "", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,8),2,bf16,bf16,Default>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<3,NDHWGC,GKZYXC,Tuple(NDHWGK,NDHWGK,NDHWGK,NDHWGK,NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16,bf16,bf16,bf16),bf16,PassThrough,PassThrough,BiasNormalizeInInferClamp,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default>" // clang-format on }; }; diff --git a/experimental/builder/test/test_ck_factory_grouped_convolution_forward_bias_clamp.cpp b/experimental/builder/test/test_ck_factory_grouped_convolution_forward_bias_clamp.cpp index b301bab966..5322120df3 100644 --- a/experimental/builder/test/test_ck_factory_grouped_convolution_forward_bias_clamp.cpp +++ b/experimental/builder/test/test_ck_factory_grouped_convolution_forward_bias_clamp.cpp @@ -87,115 +87,182 @@ struct F32_2D constexpr static auto expected = { // clang-format off - "", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,128,192,16,4,4,32,32,2,3,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp32,fp32,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp32,fp32,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp32,fp32,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,128,192,16,4,4,32,32,2,3,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,192,16,4,4,32,32,2,3,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp32,fp32,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp32,fp32,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp32,fp32,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default>" // clang-format on }; }; @@ -206,115 +273,182 @@ struct F32_TF32_2D constexpr static auto expected = { // clang-format off - "", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,128,192,16,4,4,32,32,2,3,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,tf32,tf32,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,tf32,tf32,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,tf32,tf32,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,128,192,16,4,4,32,32,2,3,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,192,16,4,4,32,32,2,3,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,tf32,tf32,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,tf32,tf32,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,tf32,tf32,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,tf32,tf32,Default>" // clang-format on }; }; @@ -325,166 +459,234 @@ struct F16_2D constexpr static auto expected = { // clang-format off - "", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,8),2,fp16,fp16,Default>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default>" // clang-format on }; }; @@ -495,175 +697,243 @@ struct BF16_2D constexpr static auto expected = { // clang-format off - "", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,8),2,bf16,bf16,Default>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<2,NHWGC,GKYXC,Tuple(NHWGK),NHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default>" // clang-format on }; }; @@ -674,115 +944,182 @@ struct F32_3D constexpr static auto expected = { // clang-format off - "", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,128,192,16,4,4,32,32,2,3,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp32,fp32,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp32,fp32,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp32,fp32,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,128,192,16,4,4,32,32,2,3,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,192,16,4,4,32,32,2,3,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp32,fp32,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp32,fp32,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp32,fp32,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default>" // clang-format on }; }; @@ -793,115 +1130,182 @@ struct F32_TF32_3D constexpr static auto expected = { // clang-format off - "", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,128,192,16,4,4,32,32,2,3,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,tf32,tf32,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,tf32,tf32,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,tf32,tf32,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,128,192,16,4,4,32,32,2,3,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,192,16,4,4,32,32,2,3,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,tf32,tf32,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,tf32,tf32,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,tf32,tf32,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,tf32,tf32,Default>" // clang-format on }; }; @@ -912,166 +1316,234 @@ struct F16_3D constexpr static auto expected = { // clang-format off - "", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,8),2,fp16,fp16,Default>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default>" // clang-format on }; }; @@ -1082,175 +1554,243 @@ struct BF16_3D constexpr static auto expected = { // clang-format off - "", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,8),2,bf16,bf16,Default>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,AddClamp,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default>" // clang-format on }; }; diff --git a/experimental/builder/test/test_ck_factory_grouped_convolution_forward_bilinear.cpp b/experimental/builder/test/test_ck_factory_grouped_convolution_forward_bilinear.cpp index 8b77b11e0c..478a591f18 100644 --- a/experimental/builder/test/test_ck_factory_grouped_convolution_forward_bilinear.cpp +++ b/experimental/builder/test/test_ck_factory_grouped_convolution_forward_bilinear.cpp @@ -55,7 +55,54 @@ struct Bilinear_F32 constexpr static auto expected = { // clang-format off - "" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>" // clang-format on }; }; @@ -66,7 +113,54 @@ struct Bilinear_F32_TF32 constexpr static auto expected = { // clang-format off - "" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32),fp32,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>" // clang-format on }; }; @@ -77,7 +171,54 @@ struct Bilinear_F16 constexpr static auto expected = { // clang-format off - "" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16),fp16,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>" // clang-format on }; }; @@ -88,7 +229,54 @@ struct Bilinear_BF16 constexpr static auto expected = { // clang-format off - "" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16),bf16,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>" // clang-format on }; }; @@ -99,7 +287,54 @@ struct Bilinear_INT8 constexpr static auto expected = { // clang-format off - "" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,s8,s8,s32,s8,Tuple(s8),s8,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,s8,s8,s32,s8,Tuple(s8),s8,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,s8,s8,s32,s8,Tuple(s8),s8,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,s8,s8,s32,s8,Tuple(s8),s8,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,s8,s8,s32,s8,Tuple(s8),s8,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,s8,s8,s32,s8,Tuple(s8),s8,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,s8,s8,s32,s8,Tuple(s8),s8,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,s8,s8,s32,s8,Tuple(s8),s8,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,s8,s8,s32,s8,Tuple(s8),s8,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,s8,s8,s32,s8,Tuple(s8),s8,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,s8,s8,s32,s8,Tuple(s8),s8,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,s8,s8,s32,s8,Tuple(s8),s8,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,s8,s8,s32,s8,Tuple(s8),s8,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,s8,s8,s32,s8,Tuple(s8),s8,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,s8,s8,s32,s8,Tuple(s8),s8,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,s8,s8,s32,s8,Tuple(s8),s8,PassThrough,PassThrough,Bilinear,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,s8,s8,s32,s8,Tuple(s8),s8,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,s8,s8,s32,s8,Tuple(s8),s8,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,s8,s8,s32,s8,Tuple(s8),s8,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,s8,s8,s32,s8,Tuple(s8),s8,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,s8,s8,s32,s8,Tuple(s8),s8,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,s8,s8,s32,s8,Tuple(s8),s8,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,s8,s8,s32,s8,Tuple(s8),s8,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,s8,s8,s32,s8,Tuple(s8),s8,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,s8,s8,s32,s8,Tuple(s8),s8,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,s8,s8,s32,s8,Tuple(s8),s8,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,s8,s8,s32,s8,Tuple(s8),s8,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,s8,s8,s32,s8,Tuple(s8),s8,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,s8,s8,s32,s8,Tuple(s8),s8,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,s8,s8,s32,s8,Tuple(s8),s8,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,s8,s8,s32,s8,Tuple(s8),s8,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,s8,s8,s32,s8,Tuple(s8),s8,PassThrough,PassThrough,Bilinear,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,s8,s8,s32,s8,Tuple(s8),s8,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,s8,s8,s32,s8,Tuple(s8),s8,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,s8,s8,s32,s8,Tuple(s8),s8,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,s8,s8,s32,s8,Tuple(s8),s8,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,s8,s8,s32,s8,Tuple(s8),s8,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,s8,s8,s32,s8,Tuple(s8),s8,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,s8,s8,s32,s8,Tuple(s8),s8,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,s8,s8,s32,s8,Tuple(s8),s8,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,s8,s8,s32,s8,Tuple(s8),s8,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,s8,s8,s32,s8,Tuple(s8),s8,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,s8,s8,s32,s8,Tuple(s8),s8,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,s8,s8,s32,s8,Tuple(s8),s8,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,s8,s8,s32,s8,Tuple(s8),s8,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,s8,s8,s32,s8,Tuple(s8),s8,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,s8,s8,s32,s8,Tuple(s8),s8,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,s8,s8,s32,s8,Tuple(s8),s8,PassThrough,PassThrough,Bilinear,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,s8,s8,Default,1>" // clang-format on }; }; diff --git a/experimental/builder/test/test_ck_factory_grouped_convolution_forward_clamp.cpp b/experimental/builder/test/test_ck_factory_grouped_convolution_forward_clamp.cpp index e678fa1258..b6161ab442 100644 --- a/experimental/builder/test/test_ck_factory_grouped_convolution_forward_clamp.cpp +++ b/experimental/builder/test/test_ck_factory_grouped_convolution_forward_clamp.cpp @@ -67,115 +67,182 @@ struct F32_2D constexpr static auto expected = { // clang-format off - "", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,128,192,16,4,4,32,32,2,3,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp32,fp32,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp32,fp32,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp32,fp32,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,128,192,16,4,4,32,32,2,3,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,192,16,4,4,32,32,2,3,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp32,fp32,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp32,fp32,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp32,fp32,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default>" // clang-format on }; }; @@ -186,115 +253,182 @@ struct F32_3D constexpr static auto expected = { // clang-format off - "", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32>" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,128,192,16,4,4,32,32,2,3,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp32,fp32,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp32,fp32,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp32,fp32,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,128,192,16,4,4,32,32,2,3,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,192,16,4,4,32,32,2,3,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp32,fp32,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp32,fp32,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp32,fp32,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp32,fp32,false>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default>" // clang-format on }; }; @@ -305,115 +439,182 @@ struct F32_TF32_2D constexpr static auto expected = { // clang-format off - "", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,128,192,16,4,4,32,32,2,3,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,tf32,tf32,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,tf32,tf32,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,tf32,tf32,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,128,192,16,4,4,32,32,2,3,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,192,16,4,4,32,32,2,3,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,tf32,tf32,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,tf32,tf32,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,tf32,tf32,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,tf32,tf32,Default>" // clang-format on }; }; @@ -424,115 +625,182 @@ struct F32_TF32_3D constexpr static auto expected = { // clang-format off - "", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32>" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,128,192,16,4,4,32,32,2,3,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,tf32,tf32,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,tf32,tf32,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,tf32,tf32,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,128,192,16,4,4,32,32,2,3,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,192,16,4,4,32,32,2,3,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,tf32,tf32,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,tf32,tf32,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,tf32,tf32,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Interwave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,4,4,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,tf32,tf32,false>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,tf32,tf32,Default>" // clang-format on }; }; @@ -543,166 +811,234 @@ struct F16_2D constexpr static auto expected = { // clang-format off - "", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,8),2,fp16,fp16,Default>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default>" // clang-format on }; }; @@ -713,166 +1049,234 @@ struct F16_3D constexpr static auto expected = { // clang-format off - "", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16>" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,fp16,fp16,false>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,8),2,fp16,fp16,Default>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default>" // clang-format on }; }; @@ -883,175 +1287,243 @@ struct BF16_2D constexpr static auto expected = { // clang-format off - "", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,8),2,bf16,bf16,Default>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default>" // clang-format on }; }; @@ -1062,175 +1534,243 @@ struct BF16_3D constexpr static auto expected = { // clang-format off - "", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16>", - "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16>" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,4),2,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,8,1,1,1,Seq(1,32,1,4),4,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,64,32,8,8,16,16,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,16>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,32>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter3x3,MNKPadding,1,64,64,16,16,4,4,16,16,4,1,Seq(4,16,1),Seq(0,2,1),Seq(0,2,1),1,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,8>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,16,64,8,8,16,16,4,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,128,32,64,8,8,32,32,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,128,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,32,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,16,64,64,8,8,16,16,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,128,64,8,8,32,32,1,2,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,16,64,8,8,16,16,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,32,64,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,16,64,8,8,16,16,2,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,128,64,32,64,8,8,32,32,1,1,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,128,64,8,8,32,32,2,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,128,64,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,16,256,64,8,8,16,16,1,4,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,224,256,64,8,8,16,16,7,8,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,16,64,8,8,16,16,4,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),2,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,224,64,8,8,16,16,8,7,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,2,1,Seq(1,64,1,4),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,16,16,8,8,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,2,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v4,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,256,32,8,8,32,32,4,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v5,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,256,32,64,8,8,32,32,2,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,32,256,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,16),8,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,64,128,64,8,8,32,32,1,2,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,256,64,64,64,8,8,32,32,1,1,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,32,1,8),8,Intrawave,v3,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,128,8,8,16,16,1,1,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(16,4,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Interwave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v1,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Filter1x1Stride1Pad0,MNKPadding,64,16,16,64,8,8,16,16,1,1,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,Seq(8,8,1),Seq(1,0,2),Seq(1,0,2),2,8,8,0,1,1,Seq(1,16,1,4),4,Intrawave,v2,bf16,bf16,false>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,2,8,1,1,1,Seq(1,32,1,8),2,bf16,bf16,Default>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default>", + "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Clamp,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default>" // clang-format on }; }; diff --git a/experimental/builder/test/test_ck_factory_grouped_convolution_forward_convscale.cpp b/experimental/builder/test/test_ck_factory_grouped_convolution_forward_convscale.cpp index 4882c6fde2..fe9cbd7dbb 100644 --- a/experimental/builder/test/test_ck_factory_grouped_convolution_forward_convscale.cpp +++ b/experimental/builder/test/test_ck_factory_grouped_convolution_forward_convscale.cpp @@ -76,7 +76,54 @@ struct F8_ConvScale constexpr static auto expected = { // clang-format off - "" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Default,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Default,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Default,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Default,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Default,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Default,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Default,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Default,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Default,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>" // clang-format on }; }; @@ -94,7 +141,54 @@ struct F8_BF8_comb1_ConvScale constexpr static auto expected = { // clang-format off - "" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Default,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Default,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Default,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Default,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Default,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Default,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Default,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Default,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Default,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf8,bf8,Default,1>" // clang-format on }; }; @@ -112,7 +206,54 @@ struct F8_BF8_comb2_ConvScale constexpr static auto expected = { // clang-format off - "" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Default,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Default,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Default,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Default,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Default,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Default,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Default,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Default,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Default,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp8,bf8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,bf8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,bf8,Default,1>" // clang-format on }; }; @@ -130,7 +271,54 @@ struct F8_BF8_comb3_ConvScale constexpr static auto expected = { // clang-format off - "" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Default,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Default,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Default,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Default,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Default,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Default,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Default,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Default,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Default,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScale,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf8,fp8,Default,1>" // clang-format on }; }; @@ -148,7 +336,54 @@ struct F8_float_CombConvScale constexpr static auto expected = { // clang-format off - "" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Default,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Default,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Default,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Default,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Default,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Default,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Default,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Default,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Default,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Filter1x1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Filter1x1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Filter1x1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Filter1x1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Filter1x1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Filter1x1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Filter1x1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Filter1x1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Filter1x1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Filter1x1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>" // clang-format on }; }; @@ -166,7 +401,54 @@ struct F8_ConvScaleRelu constexpr static auto expected = { // clang-format off - "" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScaleRelu,Default,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScaleRelu,Default,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScaleRelu,Default,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScaleRelu,Default,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScaleRelu,Default,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScaleRelu,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScaleRelu,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScaleRelu,Default,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScaleRelu,Default,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScaleRelu,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScaleRelu,Default,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScaleRelu,Default,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScaleRelu,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScaleRelu,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScaleRelu,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScaleRelu,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScaleRelu,Filter1x1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScaleRelu,Filter1x1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScaleRelu,Filter1x1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScaleRelu,Filter1x1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScaleRelu,Filter1x1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScaleRelu,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScaleRelu,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScaleRelu,Filter1x1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScaleRelu,Filter1x1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScaleRelu,Filter1x1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScaleRelu,Filter1x1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScaleRelu,Filter1x1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScaleRelu,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScaleRelu,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScaleRelu,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScaleRelu,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScaleRelu,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScaleRelu,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScaleRelu,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScaleRelu,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScaleRelu,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScaleRelu,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScaleRelu,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScaleRelu,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScaleRelu,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScaleRelu,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScaleRelu,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScaleRelu,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScaleRelu,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScaleRelu,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScaleRelu,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvScaleRelu,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>" // clang-format on }; }; @@ -184,7 +466,54 @@ struct F8_CombConvScaleRelu constexpr static auto expected = { // clang-format off - "" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Default,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Default,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Default,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Default,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Default,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Default,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Default,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Default,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Default,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Filter1x1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Filter1x1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Filter1x1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Filter1x1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Filter1x1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Filter1x1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Filter1x1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Filter1x1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Filter1x1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Filter1x1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,UnaryCombinedOp,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>" // clang-format on }; }; @@ -202,7 +531,54 @@ struct F8_ConvScaleAdd constexpr static auto expected = { // clang-format off - "" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp8,fp8,fp32,fp32,Tuple(fp32),fp8,PassThrough,PassThrough,ConvScaleAdd,Default,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp8,fp8,fp32,fp32,Tuple(fp32),fp8,PassThrough,PassThrough,ConvScaleAdd,Default,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp8,fp8,fp32,fp32,Tuple(fp32),fp8,PassThrough,PassThrough,ConvScaleAdd,Default,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp8,fp8,fp32,fp32,Tuple(fp32),fp8,PassThrough,PassThrough,ConvScaleAdd,Default,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp8,fp8,fp32,fp32,Tuple(fp32),fp8,PassThrough,PassThrough,ConvScaleAdd,Default,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp8,fp8,fp32,fp32,Tuple(fp32),fp8,PassThrough,PassThrough,ConvScaleAdd,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp8,fp8,fp32,fp32,Tuple(fp32),fp8,PassThrough,PassThrough,ConvScaleAdd,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp8,fp8,fp32,fp32,Tuple(fp32),fp8,PassThrough,PassThrough,ConvScaleAdd,Default,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp8,fp8,fp32,fp32,Tuple(fp32),fp8,PassThrough,PassThrough,ConvScaleAdd,Default,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp8,fp8,fp32,fp32,Tuple(fp32),fp8,PassThrough,PassThrough,ConvScaleAdd,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp8,fp8,fp32,fp32,Tuple(fp32),fp8,PassThrough,PassThrough,ConvScaleAdd,Default,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp8,fp8,fp32,fp32,Tuple(fp32),fp8,PassThrough,PassThrough,ConvScaleAdd,Default,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp8,fp8,fp32,fp32,Tuple(fp32),fp8,PassThrough,PassThrough,ConvScaleAdd,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp8,fp8,fp32,fp32,Tuple(fp32),fp8,PassThrough,PassThrough,ConvScaleAdd,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp8,fp8,fp32,fp32,Tuple(fp32),fp8,PassThrough,PassThrough,ConvScaleAdd,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp8,fp8,fp32,fp32,Tuple(fp32),fp8,PassThrough,PassThrough,ConvScaleAdd,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp8,fp8,fp32,fp32,Tuple(fp32),fp8,PassThrough,PassThrough,ConvScaleAdd,Filter1x1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp8,fp8,fp32,fp32,Tuple(fp32),fp8,PassThrough,PassThrough,ConvScaleAdd,Filter1x1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp8,fp8,fp32,fp32,Tuple(fp32),fp8,PassThrough,PassThrough,ConvScaleAdd,Filter1x1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp8,fp8,fp32,fp32,Tuple(fp32),fp8,PassThrough,PassThrough,ConvScaleAdd,Filter1x1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp8,fp8,fp32,fp32,Tuple(fp32),fp8,PassThrough,PassThrough,ConvScaleAdd,Filter1x1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp8,fp8,fp32,fp32,Tuple(fp32),fp8,PassThrough,PassThrough,ConvScaleAdd,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp8,fp8,fp32,fp32,Tuple(fp32),fp8,PassThrough,PassThrough,ConvScaleAdd,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp8,fp8,fp32,fp32,Tuple(fp32),fp8,PassThrough,PassThrough,ConvScaleAdd,Filter1x1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp8,fp8,fp32,fp32,Tuple(fp32),fp8,PassThrough,PassThrough,ConvScaleAdd,Filter1x1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp8,fp8,fp32,fp32,Tuple(fp32),fp8,PassThrough,PassThrough,ConvScaleAdd,Filter1x1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp8,fp8,fp32,fp32,Tuple(fp32),fp8,PassThrough,PassThrough,ConvScaleAdd,Filter1x1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp8,fp8,fp32,fp32,Tuple(fp32),fp8,PassThrough,PassThrough,ConvScaleAdd,Filter1x1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp8,fp8,fp32,fp32,Tuple(fp32),fp8,PassThrough,PassThrough,ConvScaleAdd,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp8,fp8,fp32,fp32,Tuple(fp32),fp8,PassThrough,PassThrough,ConvScaleAdd,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp8,fp8,fp32,fp32,Tuple(fp32),fp8,PassThrough,PassThrough,ConvScaleAdd,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp8,fp8,fp32,fp32,Tuple(fp32),fp8,PassThrough,PassThrough,ConvScaleAdd,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp8,fp8,fp32,fp32,Tuple(fp32),fp8,PassThrough,PassThrough,ConvScaleAdd,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp8,fp8,fp32,fp32,Tuple(fp32),fp8,PassThrough,PassThrough,ConvScaleAdd,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp8,fp8,fp32,fp32,Tuple(fp32),fp8,PassThrough,PassThrough,ConvScaleAdd,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp8,fp8,fp32,fp32,Tuple(fp32),fp8,PassThrough,PassThrough,ConvScaleAdd,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp8,fp8,fp32,fp32,Tuple(fp32),fp8,PassThrough,PassThrough,ConvScaleAdd,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp8,fp8,fp32,fp32,Tuple(fp32),fp8,PassThrough,PassThrough,ConvScaleAdd,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp8,fp8,fp32,fp32,Tuple(fp32),fp8,PassThrough,PassThrough,ConvScaleAdd,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp8,fp8,fp32,fp32,Tuple(fp32),fp8,PassThrough,PassThrough,ConvScaleAdd,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp8,fp8,fp32,fp32,Tuple(fp32),fp8,PassThrough,PassThrough,ConvScaleAdd,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp8,fp8,fp32,fp32,Tuple(fp32),fp8,PassThrough,PassThrough,ConvScaleAdd,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp8,fp8,fp32,fp32,Tuple(fp32),fp8,PassThrough,PassThrough,ConvScaleAdd,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp8,fp8,fp32,fp32,Tuple(fp32),fp8,PassThrough,PassThrough,ConvScaleAdd,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp8,fp8,fp32,fp32,Tuple(fp32),fp8,PassThrough,PassThrough,ConvScaleAdd,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp8,fp8,fp32,fp32,Tuple(fp32),fp8,PassThrough,PassThrough,ConvScaleAdd,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp8,fp8,fp32,fp32,Tuple(fp32),fp8,PassThrough,PassThrough,ConvScaleAdd,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK),NDHWGK,fp8,fp8,fp32,fp32,Tuple(fp32),fp8,PassThrough,PassThrough,ConvScaleAdd,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>" // clang-format on }; }; @@ -220,7 +596,54 @@ struct F8_ConvInvscale constexpr static auto expected = { // clang-format off - "" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvInvscale,Default,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvInvscale,Default,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvInvscale,Default,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvInvscale,Default,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvInvscale,Default,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvInvscale,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvInvscale,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvInvscale,Default,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvInvscale,Default,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvInvscale,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvInvscale,Default,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvInvscale,Default,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvInvscale,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvInvscale,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvInvscale,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvInvscale,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvInvscale,Filter1x1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvInvscale,Filter1x1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvInvscale,Filter1x1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvInvscale,Filter1x1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvInvscale,Filter1x1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvInvscale,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvInvscale,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvInvscale,Filter1x1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvInvscale,Filter1x1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvInvscale,Filter1x1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvInvscale,Filter1x1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvInvscale,Filter1x1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvInvscale,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvInvscale,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvInvscale,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvInvscale,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvInvscale,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvInvscale,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvInvscale,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvInvscale,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvInvscale,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvInvscale,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvInvscale,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvInvscale,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvInvscale,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvInvscale,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvInvscale,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvInvscale,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvInvscale,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvInvscale,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvInvscale,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp8,fp8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp8,fp8,fp32,fp32,EmptyTuple,fp8,PassThrough,PassThrough,ConvInvscale,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp8,fp8,Default,1>" // clang-format on }; }; diff --git a/experimental/builder/test/test_ck_factory_grouped_convolution_forward_dynamic_op.cpp b/experimental/builder/test/test_ck_factory_grouped_convolution_forward_dynamic_op.cpp index 7437385e2a..56ecda2c7b 100644 --- a/experimental/builder/test/test_ck_factory_grouped_convolution_forward_dynamic_op.cpp +++ b/experimental/builder/test/test_ck_factory_grouped_convolution_forward_dynamic_op.cpp @@ -85,7 +85,9 @@ struct DyOp_F32_2 constexpr static auto expected = { // clang-format off - "" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,DynamicUnaryOp,Default,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,DynamicUnaryOp,Default,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,DynamicUnaryOp,Default,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>" // clang-format on }; }; @@ -96,7 +98,9 @@ struct DyOp_F32_3 constexpr static auto expected = { // clang-format off - "" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,DynamicUnaryOp,Default,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,DynamicUnaryOp,Default,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,DynamicUnaryOp,Default,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>" // clang-format on }; }; @@ -107,7 +111,9 @@ struct DyOp_F16_2 constexpr static auto expected = { // clang-format off - "" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,DynamicUnaryOp,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,DynamicUnaryOp,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,DynamicUnaryOp,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>" // clang-format on }; }; @@ -118,7 +124,9 @@ struct DyOp_F16_3 constexpr static auto expected = { // clang-format off - "" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,DynamicUnaryOp,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,DynamicUnaryOp,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,DynamicUnaryOp,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>" // clang-format on }; }; @@ -129,7 +137,9 @@ struct DyOp_BF16_2 constexpr static auto expected = { // clang-format off - "" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,DynamicUnaryOp,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,DynamicUnaryOp,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,DynamicUnaryOp,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>" // clang-format on }; }; @@ -140,7 +150,9 @@ struct DyOp_BF16_3 constexpr static auto expected = { // clang-format off - "" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,DynamicUnaryOp,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,DynamicUnaryOp,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,DynamicUnaryOp,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>" // clang-format on }; }; @@ -151,7 +163,9 @@ struct DyOp_INT8_2 constexpr static auto expected = { // clang-format off - "" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,DynamicUnaryOp,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,DynamicUnaryOp,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<2,NHWGC,GKYXC,EmptyTuple,NHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,DynamicUnaryOp,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,s8,s8,Default,1>" // clang-format on }; }; @@ -162,7 +176,9 @@ struct DyOp_INT8_3 constexpr static auto expected = { // clang-format off - "" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,DynamicUnaryOp,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,DynamicUnaryOp,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,DynamicUnaryOp,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,s8,s8,Default,1>" // clang-format on }; }; diff --git a/experimental/builder/test/test_ck_factory_grouped_convolution_forward_scale.cpp b/experimental/builder/test/test_ck_factory_grouped_convolution_forward_scale.cpp index 8144d7bedd..f897fee66b 100644 --- a/experimental/builder/test/test_ck_factory_grouped_convolution_forward_scale.cpp +++ b/experimental/builder/test/test_ck_factory_grouped_convolution_forward_scale.cpp @@ -53,7 +53,54 @@ struct F32 constexpr static auto expected = { // clang-format off - "" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Default,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Default,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Default,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Default,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Default,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Default,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Default,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Default,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Default,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Default,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Default,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Default,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Default,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Default,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Default,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Default,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,fp32,fp32,Default,1>" // clang-format on }; }; @@ -64,7 +111,54 @@ struct F32_TF32 constexpr static auto expected = { // clang-format off - "" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Default,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Default,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Default,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Default,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Default,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Default,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Default,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Default,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Default,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Default,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Default,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Default,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Default,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Default,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Default,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Default,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,16,4,4,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,16,4,4,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,16,4,4,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,16,4,4,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,16,4,4,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,16,4,4,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,16,4,4,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,16,4,4,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,tf32,tf32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp32,fp32,fp32,fp32,EmptyTuple,fp32,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),4,tf32,tf32,Default,1>" // clang-format on }; }; @@ -75,7 +169,54 @@ struct F16 constexpr static auto expected = { // clang-format off - "" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Scale,Default,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Scale,Default,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Scale,Default,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Scale,Default,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Scale,Default,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Scale,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Scale,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Scale,Default,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Scale,Default,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Scale,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Scale,Default,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Scale,Default,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Scale,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Scale,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Scale,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Scale,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,fp16,fp16,fp32,fp16,EmptyTuple,fp16,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,fp16,fp16,Default,1>" // clang-format on }; }; @@ -86,7 +227,54 @@ struct BF16 constexpr static auto expected = { // clang-format off - "" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Scale,Default,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Scale,Default,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Scale,Default,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Scale,Default,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Scale,Default,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Scale,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Scale,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Scale,Default,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Scale,Default,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Scale,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Scale,Default,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Scale,Default,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Scale,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Scale,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Scale,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Scale,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,bf16,bf16,fp32,bf16,EmptyTuple,bf16,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,bf16,bf16,Default,1>" // clang-format on }; }; @@ -97,7 +285,54 @@ struct S8 constexpr static auto expected = { // clang-format off - "" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,Scale,Default,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,Scale,Default,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,Scale,Default,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,Scale,Default,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,Scale,Default,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,Scale,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,Scale,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,Scale,Default,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,Scale,Default,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,Scale,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,Scale,Default,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,Scale,Default,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,Scale,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,Scale,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,Scale,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,Scale,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,Scale,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,128,128,128,32,8,8,32,32,4,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,128,128,32,32,8,8,32,32,2,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,128,128,64,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,128,32,128,32,8,8,32,32,1,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,128,64,128,32,8,8,32,32,2,2,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,32,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,256,128,256,32,8,8,32,32,2,4,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,256,128,64,32,8,8,32,32,2,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,256,64,128,32,8,8,32,32,1,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,64,32,64,32,8,8,32,32,1,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,s8,s8,s32,s8,EmptyTuple,s8,PassThrough,PassThrough,Scale,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),8,s8,s8,Default,1>" // clang-format on }; }; diff --git a/experimental/builder/test/test_ck_factory_grouped_convolution_forward_scaleadd_ab.cpp b/experimental/builder/test/test_ck_factory_grouped_convolution_forward_scaleadd_ab.cpp index beebc3f853..aa68a21300 100644 --- a/experimental/builder/test/test_ck_factory_grouped_convolution_forward_scaleadd_ab.cpp +++ b/experimental/builder/test/test_ck_factory_grouped_convolution_forward_scaleadd_ab.cpp @@ -53,7 +53,18 @@ struct F32 constexpr static auto expected = { // clang-format off - "" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,Tuple(fp32,fp32),Tuple(fp32,fp32),fp32,fp32,EmptyTuple,fp32,ScaleAdd,ScaleAdd,PassThrough,Default,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,1,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,1,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,Tuple(fp32,fp32),Tuple(fp32,fp32),fp32,fp32,EmptyTuple,fp32,ScaleAdd,ScaleAdd,PassThrough,Default,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,Tuple(fp32,fp32),Tuple(fp32,fp32),fp32,fp32,EmptyTuple,fp32,ScaleAdd,ScaleAdd,PassThrough,Default,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,Tuple(fp32,fp32),Tuple(fp32,fp32),fp32,fp32,EmptyTuple,fp32,ScaleAdd,ScaleAdd,PassThrough,Default,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,1,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,1,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,Tuple(fp32,fp32),Tuple(fp32,fp32),fp32,fp32,EmptyTuple,fp32,ScaleAdd,ScaleAdd,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,1,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,1,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,Tuple(fp32,fp32),Tuple(fp32,fp32),fp32,fp32,EmptyTuple,fp32,ScaleAdd,ScaleAdd,PassThrough,Filter1x1Pad0,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,Tuple(fp32,fp32),Tuple(fp32,fp32),fp32,fp32,EmptyTuple,fp32,ScaleAdd,ScaleAdd,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,Tuple(fp32,fp32),Tuple(fp32,fp32),fp32,fp32,EmptyTuple,fp32,ScaleAdd,ScaleAdd,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,1,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,1,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,Tuple(fp32,fp32),Tuple(fp32,fp32),fp32,fp32,EmptyTuple,fp32,ScaleAdd,ScaleAdd,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,1,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,1,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,Tuple(fp32,fp32),Tuple(fp32,fp32),fp32,fp32,EmptyTuple,fp32,ScaleAdd,ScaleAdd,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,Tuple(fp32,fp32),Tuple(fp32,fp32),fp32,fp32,EmptyTuple,fp32,ScaleAdd,ScaleAdd,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,Tuple(fp32,fp32),Tuple(fp32,fp32),fp32,fp32,EmptyTuple,fp32,ScaleAdd,ScaleAdd,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,1,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,1,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>" // clang-format on }; }; @@ -64,7 +75,18 @@ struct F16 constexpr static auto expected = { // clang-format off - "" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,Tuple(fp16,fp16),Tuple(fp16,fp16),fp32,fp16,EmptyTuple,fp16,ScaleAdd,ScaleAdd,PassThrough,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,1,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,1,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,Tuple(fp16,fp16),Tuple(fp16,fp16),fp32,fp16,EmptyTuple,fp16,ScaleAdd,ScaleAdd,PassThrough,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,Tuple(fp16,fp16),Tuple(fp16,fp16),fp32,fp16,EmptyTuple,fp16,ScaleAdd,ScaleAdd,PassThrough,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,Tuple(fp16,fp16),Tuple(fp16,fp16),fp32,fp16,EmptyTuple,fp16,ScaleAdd,ScaleAdd,PassThrough,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,1,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,1,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,Tuple(fp16,fp16),Tuple(fp16,fp16),fp32,fp16,EmptyTuple,fp16,ScaleAdd,ScaleAdd,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,1,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,1,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,Tuple(fp16,fp16),Tuple(fp16,fp16),fp32,fp16,EmptyTuple,fp16,ScaleAdd,ScaleAdd,PassThrough,Filter1x1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,Tuple(fp16,fp16),Tuple(fp16,fp16),fp32,fp16,EmptyTuple,fp16,ScaleAdd,ScaleAdd,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,Tuple(fp16,fp16),Tuple(fp16,fp16),fp32,fp16,EmptyTuple,fp16,ScaleAdd,ScaleAdd,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,1,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,1,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,Tuple(fp16,fp16),Tuple(fp16,fp16),fp32,fp16,EmptyTuple,fp16,ScaleAdd,ScaleAdd,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,1,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,1,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,Tuple(fp16,fp16),Tuple(fp16,fp16),fp32,fp16,EmptyTuple,fp16,ScaleAdd,ScaleAdd,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,Tuple(fp16,fp16),Tuple(fp16,fp16),fp32,fp16,EmptyTuple,fp16,ScaleAdd,ScaleAdd,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,Tuple(fp16,fp16),Tuple(fp16,fp16),fp32,fp16,EmptyTuple,fp16,ScaleAdd,ScaleAdd,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,1,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,1,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>" // clang-format on }; }; @@ -75,7 +97,18 @@ struct BF16 constexpr static auto expected = { // clang-format off - "" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,Tuple(bf16,bf16),Tuple(bf16,bf16),fp32,bf16,EmptyTuple,bf16,ScaleAdd,ScaleAdd,PassThrough,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,1,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,1,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,Tuple(bf16,bf16),Tuple(bf16,bf16),fp32,bf16,EmptyTuple,bf16,ScaleAdd,ScaleAdd,PassThrough,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,Tuple(bf16,bf16),Tuple(bf16,bf16),fp32,bf16,EmptyTuple,bf16,ScaleAdd,ScaleAdd,PassThrough,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,Tuple(bf16,bf16),Tuple(bf16,bf16),fp32,bf16,EmptyTuple,bf16,ScaleAdd,ScaleAdd,PassThrough,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,1,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,1,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,Tuple(bf16,bf16),Tuple(bf16,bf16),fp32,bf16,EmptyTuple,bf16,ScaleAdd,ScaleAdd,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,1,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,1,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,Tuple(bf16,bf16),Tuple(bf16,bf16),fp32,bf16,EmptyTuple,bf16,ScaleAdd,ScaleAdd,PassThrough,Filter1x1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,Tuple(bf16,bf16),Tuple(bf16,bf16),fp32,bf16,EmptyTuple,bf16,ScaleAdd,ScaleAdd,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,Tuple(bf16,bf16),Tuple(bf16,bf16),fp32,bf16,EmptyTuple,bf16,ScaleAdd,ScaleAdd,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,1,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,1,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,Tuple(bf16,bf16),Tuple(bf16,bf16),fp32,bf16,EmptyTuple,bf16,ScaleAdd,ScaleAdd,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,1,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,1,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,Tuple(bf16,bf16),Tuple(bf16,bf16),fp32,bf16,EmptyTuple,bf16,ScaleAdd,ScaleAdd,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,Tuple(bf16,bf16),Tuple(bf16,bf16),fp32,bf16,EmptyTuple,bf16,ScaleAdd,ScaleAdd,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,Tuple(bf16,bf16),Tuple(bf16,bf16),fp32,bf16,EmptyTuple,bf16,ScaleAdd,ScaleAdd,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,1,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,1,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>" // clang-format on }; }; @@ -86,7 +119,18 @@ struct S8 constexpr static auto expected = { // clang-format off - "" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,Tuple(s8,s8),Tuple(s8,s8),s32,s8,EmptyTuple,s8,ScaleAdd,ScaleAdd,PassThrough,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,1,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,1,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,Tuple(s8,s8),Tuple(s8,s8),s32,s8,EmptyTuple,s8,ScaleAdd,ScaleAdd,PassThrough,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,Tuple(s8,s8),Tuple(s8,s8),s32,s8,EmptyTuple,s8,ScaleAdd,ScaleAdd,PassThrough,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,Tuple(s8,s8),Tuple(s8,s8),s32,s8,EmptyTuple,s8,ScaleAdd,ScaleAdd,PassThrough,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,1,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,1,1,1,1,Seq(1,16,1,4),1,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,Tuple(s8,s8),Tuple(s8,s8),s32,s8,EmptyTuple,s8,ScaleAdd,ScaleAdd,PassThrough,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,1,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,1,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,Tuple(s8,s8),Tuple(s8,s8),s32,s8,EmptyTuple,s8,ScaleAdd,ScaleAdd,PassThrough,Filter1x1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,Tuple(s8,s8),Tuple(s8,s8),s32,s8,EmptyTuple,s8,ScaleAdd,ScaleAdd,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,Tuple(s8,s8),Tuple(s8,s8),s32,s8,EmptyTuple,s8,ScaleAdd,ScaleAdd,PassThrough,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,1,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,1,1,1,1,Seq(1,16,1,4),1,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,Tuple(s8,s8),Tuple(s8,s8),s32,s8,EmptyTuple,s8,ScaleAdd,ScaleAdd,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,1,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,1,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,Tuple(s8,s8),Tuple(s8,s8),s32,s8,EmptyTuple,s8,ScaleAdd,ScaleAdd,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,Tuple(s8,s8),Tuple(s8,s8),s32,s8,EmptyTuple,s8,ScaleAdd,ScaleAdd,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,EmptyTuple,NDHWGK,Tuple(s8,s8),Tuple(s8,s8),s32,s8,EmptyTuple,s8,ScaleAdd,ScaleAdd,PassThrough,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,1,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,1,1,1,1,Seq(1,16,1,4),1,s8,s8,Default,1>" // clang-format on }; }; diff --git a/experimental/builder/test/test_ck_factory_grouped_convolution_forward_scaleadd_scaleadd_relu.cpp b/experimental/builder/test/test_ck_factory_grouped_convolution_forward_scaleadd_scaleadd_relu.cpp index 79feb298cd..16077db268 100644 --- a/experimental/builder/test/test_ck_factory_grouped_convolution_forward_scaleadd_scaleadd_relu.cpp +++ b/experimental/builder/test/test_ck_factory_grouped_convolution_forward_scaleadd_scaleadd_relu.cpp @@ -54,7 +54,18 @@ struct F32 constexpr static auto expected = { // clang-format off - "" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,G_K),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32),fp32,PassThrough,PassThrough,ScaleAddScaleAddRelu,Default,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,G_K),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32),fp32,PassThrough,PassThrough,ScaleAddScaleAddRelu,Default,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,G_K),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32),fp32,PassThrough,PassThrough,ScaleAddScaleAddRelu,Default,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,G_K),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32),fp32,PassThrough,PassThrough,ScaleAddScaleAddRelu,Default,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,G_K),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32),fp32,PassThrough,PassThrough,ScaleAddScaleAddRelu,Filter1x1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,G_K),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32),fp32,PassThrough,PassThrough,ScaleAddScaleAddRelu,Filter1x1Pad0,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,G_K),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32),fp32,PassThrough,PassThrough,ScaleAddScaleAddRelu,Filter1x1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,G_K),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32),fp32,PassThrough,PassThrough,ScaleAddScaleAddRelu,Filter1x1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,G_K),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32),fp32,PassThrough,PassThrough,ScaleAddScaleAddRelu,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,16,4,4,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,G_K),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32),fp32,PassThrough,PassThrough,ScaleAddScaleAddRelu,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,16,4,4,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,16,1,16),4,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,G_K),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32),fp32,PassThrough,PassThrough,ScaleAddScaleAddRelu,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,16,4,4,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,4,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,G_K),NDHWGK,fp32,fp32,fp32,fp32,Tuple(fp32,fp32),fp32,PassThrough,PassThrough,ScaleAddScaleAddRelu,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,16,4,4,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,4,1,1,1,Seq(1,8,1,8),1,fp32,fp32,Default,1>" // clang-format on }; }; @@ -65,7 +76,18 @@ struct F16 constexpr static auto expected = { // clang-format off - "" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,G_K),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16),fp16,PassThrough,PassThrough,ScaleAddScaleAddRelu,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,G_K),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16),fp16,PassThrough,PassThrough,ScaleAddScaleAddRelu,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,G_K),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16),fp16,PassThrough,PassThrough,ScaleAddScaleAddRelu,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,G_K),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16),fp16,PassThrough,PassThrough,ScaleAddScaleAddRelu,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,G_K),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16),fp16,PassThrough,PassThrough,ScaleAddScaleAddRelu,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,G_K),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16),fp16,PassThrough,PassThrough,ScaleAddScaleAddRelu,Filter1x1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,G_K),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16),fp16,PassThrough,PassThrough,ScaleAddScaleAddRelu,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,G_K),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16),fp16,PassThrough,PassThrough,ScaleAddScaleAddRelu,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,G_K),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16),fp16,PassThrough,PassThrough,ScaleAddScaleAddRelu,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,G_K),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16),fp16,PassThrough,PassThrough,ScaleAddScaleAddRelu,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,G_K),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16),fp16,PassThrough,PassThrough,ScaleAddScaleAddRelu,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,G_K),NDHWGK,fp16,fp16,fp32,fp16,Tuple(fp16,fp16),fp16,PassThrough,PassThrough,ScaleAddScaleAddRelu,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,fp16,fp16,Default,1>" // clang-format on }; }; @@ -76,7 +98,18 @@ struct BF16 constexpr static auto expected = { // clang-format off - "" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,G_K),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16),bf16,PassThrough,PassThrough,ScaleAddScaleAddRelu,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,G_K),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16),bf16,PassThrough,PassThrough,ScaleAddScaleAddRelu,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,G_K),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16),bf16,PassThrough,PassThrough,ScaleAddScaleAddRelu,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,G_K),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16),bf16,PassThrough,PassThrough,ScaleAddScaleAddRelu,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,G_K),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16),bf16,PassThrough,PassThrough,ScaleAddScaleAddRelu,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,G_K),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16),bf16,PassThrough,PassThrough,ScaleAddScaleAddRelu,Filter1x1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,G_K),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16),bf16,PassThrough,PassThrough,ScaleAddScaleAddRelu,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,G_K),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16),bf16,PassThrough,PassThrough,ScaleAddScaleAddRelu,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,G_K),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16),bf16,PassThrough,PassThrough,ScaleAddScaleAddRelu,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,G_K),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16),bf16,PassThrough,PassThrough,ScaleAddScaleAddRelu,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,G_K),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16),bf16,PassThrough,PassThrough,ScaleAddScaleAddRelu,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,G_K),NDHWGK,bf16,bf16,fp32,bf16,Tuple(bf16,bf16),bf16,PassThrough,PassThrough,ScaleAddScaleAddRelu,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,bf16,bf16,Default,1>" // clang-format on }; }; @@ -87,7 +120,18 @@ struct S8 constexpr static auto expected = { // clang-format off - "" + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,G_K),NDHWGK,s8,s8,s32,s8,Tuple(fp32,fp32),s8,PassThrough,PassThrough,ScaleAddScaleAddRelu,Default,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,G_K),NDHWGK,s8,s8,s32,s8,Tuple(fp32,fp32),s8,PassThrough,PassThrough,ScaleAddScaleAddRelu,Default,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,G_K),NDHWGK,s8,s8,s32,s8,Tuple(fp32,fp32),s8,PassThrough,PassThrough,ScaleAddScaleAddRelu,Default,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,G_K),NDHWGK,s8,s8,s32,s8,Tuple(fp32,fp32),s8,PassThrough,PassThrough,ScaleAddScaleAddRelu,Default,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,G_K),NDHWGK,s8,s8,s32,s8,Tuple(fp32,fp32),s8,PassThrough,PassThrough,ScaleAddScaleAddRelu,Filter1x1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,G_K),NDHWGK,s8,s8,s32,s8,Tuple(fp32,fp32),s8,PassThrough,PassThrough,ScaleAddScaleAddRelu,Filter1x1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,G_K),NDHWGK,s8,s8,s32,s8,Tuple(fp32,fp32),s8,PassThrough,PassThrough,ScaleAddScaleAddRelu,Filter1x1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,G_K),NDHWGK,s8,s8,s32,s8,Tuple(fp32,fp32),s8,PassThrough,PassThrough,ScaleAddScaleAddRelu,Filter1x1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,G_K),NDHWGK,s8,s8,s32,s8,Tuple(fp32,fp32),s8,PassThrough,PassThrough,ScaleAddScaleAddRelu,Filter1x1Stride1Pad0,MNKPadding,1,256,128,128,32,8,8,32,32,2,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,G_K),NDHWGK,s8,s8,s32,s8,Tuple(fp32,fp32),s8,PassThrough,PassThrough,ScaleAddScaleAddRelu,Filter1x1Stride1Pad0,MNKPadding,1,256,256,128,32,8,8,32,32,4,2,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,64,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,32,1,8),8,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,G_K),NDHWGK,s8,s8,s32,s8,Tuple(fp32,fp32),s8,PassThrough,PassThrough,ScaleAddScaleAddRelu,Filter1x1Stride1Pad0,MNKPadding,1,64,64,32,32,8,8,32,32,2,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,8,8,1,1,1,Seq(1,16,1,4),1,s8,s8,Default,1>", + "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<3,NDHWGC,GKZYXC,Tuple(NDHWGK,G_K),NDHWGK,s8,s8,s32,s8,Tuple(fp32,fp32),s8,PassThrough,PassThrough,ScaleAddScaleAddRelu,Filter1x1Stride1Pad0,MNKPadding,1,64,64,64,32,8,8,32,32,2,2,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,Seq(4,16,1),Seq(1,0,2),Seq(1,0,2),2,1,8,1,1,1,Seq(1,16,1,4),1,s8,s8,Default,1>" // clang-format on }; };