upgrade from clang-format-12 to clang-format-18 (#2568)

* upgrade to clang-format-18 * update to clang-format-18 in pre-commit-config
2026-05-13 09:45:56 +00:00 · 2025-07-28 11:34:07 -07:00
parent 9786087010
commit 504b101da3
373 changed files with 3351 additions and 3760 deletions
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm.hpp
@@ -116,7 +116,7 @@ struct ReferenceMoeGemm : public device::BaseOperator
 #if CK_USE_PK4_LAYOUT_SHUFFLE
                            v_a = i4_to_f32_gfx9(i4);
 #else
-                            v_a    = i4 - 8;
+                            v_a = i4 - 8;
 #endif
                        }
                        else
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm1_blockscale.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm1_blockscale.hpp
@@ -110,7 +110,7 @@ struct ReferenceMoeGemm1BlockScale : public device::BaseOperator
 #if CK_USE_PK4_LAYOUT_SHUFFLE
                            v_a = i4_to_f32_gfx9(i4);
 #else
-                            v_a    = i4 - 8;
+                            v_a = i4 - 8;
 #endif
                        }
                        else
--- a/library/include/ck/library/reference_tensor_operation/gpu/reference_gemm.hpp
+++ b/library/include/ck/library/reference_tensor_operation/gpu/reference_gemm.hpp
@@ -25,17 +25,17 @@ template <typename ALayout,
          typename ComputeTypeB>
 __global__ void
 #if CK_USE_LAUNCH_BOUNDS
-    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
+__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
 #endif
-        naive_gemm_kernel(const ADataType* __restrict__ p_a_grid,
-                          const BDataType* __restrict__ p_b_grid,
-                          CDataType* __restrict__ p_c_grid,
-                          index_t m,
-                          index_t n,
-                          index_t k,
-                          const AElementwiseOperation a_element_op,
-                          const BElementwiseOperation b_element_op,
-                          const CDEElementwiseOperation c_element_op)
+    naive_gemm_kernel(const ADataType* __restrict__ p_a_grid,
+                      const BDataType* __restrict__ p_b_grid,
+                      CDataType* __restrict__ p_c_grid,
+                      index_t m,
+                      index_t n,
+                      index_t k,
+                      const AElementwiseOperation a_element_op,
+                      const BElementwiseOperation b_element_op,
+                      const CDEElementwiseOperation c_element_op)
 {
    using RowMajor = ck::tensor_layout::gemm::RowMajor;

--- a/library/include/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_column_to_image_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_column_to_image_instance.hpp
@@ -23,8 +23,9 @@ template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

 template <ck::index_t NDimSpatial, typename InLayout>
-using device_column_to_image_bf16_instances = std::tuple<
-    // clang-format off
+using device_column_to_image_bf16_instances =
+    std::tuple<
+        // clang-format off
        //#####################|        Num| InLayout| InDataType| OutDataType| Block|  MPer|  KPer|    Thread| Scalar|
        //#####################|        Dim|         |           |            |  Size| Block| Block|   Cluster|    Per|
        //#####################|    Spatial|         |           |            |      |      |      |   Lengths| Vector|
@@ -39,12 +40,13 @@ using device_column_to_image_bf16_instances = std::tuple<
        DeviceColumnToImageImpl<NDimSpatial, InLayout,       BF16,        BF16,   256,    64,    64, S<16, 16>,     4>,
        DeviceColumnToImageImpl<NDimSpatial, InLayout,       BF16,        BF16,   256,   128,   128, S<16, 16>,     4>,
        DeviceColumnToImageImpl<NDimSpatial, InLayout,       BF16,        BF16,   256,   128,   128, S<16, 16>,     8>
-    // clang-format on
-    >;
+        // clang-format on
+        >;

 template <ck::index_t NDimSpatial, typename InLayout>
-using device_column_to_image_f16_instances = std::tuple<
-    // clang-format off
+using device_column_to_image_f16_instances =
+    std::tuple<
+        // clang-format off
        //#####################|        Num| InLayout| InDataType| OutDataType| Block|  MPer|  KPer|    Thread| Scalar|
        //#####################|        Dim|         |           |            |  Size| Block| Block|   Cluster|    Per|
        //#####################|    Spatial|         |           |            |      |      |      |   Lengths| Vector|
@@ -59,12 +61,13 @@ using device_column_to_image_f16_instances = std::tuple<
        DeviceColumnToImageImpl<NDimSpatial, InLayout,        F16,         F16,   256,    64,    64, S<16, 16>,     4>,
        DeviceColumnToImageImpl<NDimSpatial, InLayout,        F16,         F16,   256,   128,   128, S<16, 16>,     4>,
        DeviceColumnToImageImpl<NDimSpatial, InLayout,        F16,         F16,   256,   128,   128, S<16, 16>,     8>
-    // clang-format on
-    >;
+        // clang-format on
+        >;

 template <ck::index_t NDimSpatial, typename InLayout>
-using device_column_to_image_f32_instances = std::tuple<
-    // clang-format off
+using device_column_to_image_f32_instances =
+    std::tuple<
+        // clang-format off
        //#####################|        Num| InLayout| InDataType| OutDataType| Block|  MPer|  KPer|    Thread| Scalar|
        //#####################|        Dim|         |           |            |  Size| Block| Block|   Cluster|    Per|
        //#####################|    Spatial|         |           |            |      |      |      |   Lengths| Vector|
@@ -76,12 +79,13 @@ using device_column_to_image_f32_instances = std::tuple<
        DeviceColumnToImageImpl<NDimSpatial, InLayout,        F32,         F32,   128,    32,    64,  S<8, 16>,     4>,
        DeviceColumnToImageImpl<NDimSpatial, InLayout,        F32,         F32,   256,    64,    64, S<16, 16>,     4>,
        DeviceColumnToImageImpl<NDimSpatial, InLayout,        F32,         F32,   256,   128,   128, S<16, 16>,     4>
-    // clang-format on
-    >;
+        // clang-format on
+        >;

 template <ck::index_t NDimSpatial, typename InLayout>
-using device_column_to_image_i8_instances = std::tuple<
-    // clang-format off
+using device_column_to_image_i8_instances =
+    std::tuple<
+        // clang-format off
        //#####################|        Num| InLayout| InDataType| OutDataType| Block|  MPer|  KPer|    Thread| Scalar|
        //#####################|        Dim|         |           |            |  Size| Block| Block|   Cluster|    Per|
        //#####################|    Spatial|         |           |            |      |      |      |   Lengths| Vector|
@@ -97,8 +101,8 @@ using device_column_to_image_i8_instances = std::tuple<
        DeviceColumnToImageImpl<NDimSpatial, InLayout,     int8_t,      int8_t,   256,   128,   128, S<16, 16>,     4>,
        DeviceColumnToImageImpl<NDimSpatial, InLayout,     int8_t,      int8_t,   256,   128,   128, S<16, 16>,     8>,
        DeviceColumnToImageImpl<NDimSpatial, InLayout,     int8_t,      int8_t,   256,   256,   256, S<16, 16>,     16>
-    // clang-format on
-    >;
+        // clang-format on
+        >;

 } // namespace instance
 } // namespace device
--- a/library/include/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_image_to_column_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_image_to_column_instance.hpp
@@ -23,8 +23,9 @@ template <ck::index_t... Is>
 using S = ck::Sequence<Is...>;

 template <ck::index_t NDimSpatial, typename InLayout>
-using device_image_to_column_bf16_instances = std::tuple<
-    // clang-format off
+using device_image_to_column_bf16_instances =
+    std::tuple<
+        // clang-format off
        //#####################|        Num| InLayout| InDataType| OutDataType| Block|  MPer|  KPer|    Thread| Scalar|
        //#####################|        Dim|         |           |            |  Size| Block| Block|   Cluster|    Per|
        //#####################|    Spatial|         |           |            |      |      |      |   Lengths| Vector|
@@ -38,12 +39,13 @@ using device_image_to_column_bf16_instances = std::tuple<
        DeviceImageToColumnImpl<NDimSpatial, InLayout,       BF16,        BF16,   256,    64,    64, S<16, 16>,     4>,
        DeviceImageToColumnImpl<NDimSpatial, InLayout,       BF16,        BF16,   256,   128,   128, S<16, 16>,     4>,
        DeviceImageToColumnImpl<NDimSpatial, InLayout,       BF16,        BF16,   256,   128,   128, S<16, 16>,     8>
-    // clang-format on
-    >;
+        // clang-format on
+        >;

 template <ck::index_t NDimSpatial, typename InLayout>
-using device_image_to_column_f16_instances = std::tuple<
-    // clang-format off
+using device_image_to_column_f16_instances =
+    std::tuple<
+        // clang-format off
        //#####################|        Num| InLayout| InDataType| OutDataType| Block|  MPer|  KPer|    Thread| Scalar|
        //#####################|        Dim|         |           |            |  Size| Block| Block|   Cluster|    Per|
        //#####################|    Spatial|         |           |            |      |      |      |   Lengths| Vector|
@@ -58,12 +60,13 @@ using device_image_to_column_f16_instances = std::tuple<
        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F16,         F16,   256,    64,    64, S<16, 16>,     4>,
        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F16,         F16,   256,   128,   128, S<16, 16>,     4>,
        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F16,         F16,   256,   128,   128, S<16, 16>,     8>
-    // clang-format on
-    >;
+        // clang-format on
+        >;

 template <ck::index_t NDimSpatial, typename InLayout>
-using device_image_to_column_f32_instances = std::tuple<
-    // clang-format off
+using device_image_to_column_f32_instances =
+    std::tuple<
+        // clang-format off
        //#####################|        Num| InLayout| InDataType| OutDataType| Block|  MPer|  KPer|    Thread| Scalar|
        //#####################|        Dim|         |           |            |  Size| Block| Block|   Cluster|    Per|
        //#####################|    Spatial|         |           |            |      |      |      |   Lengths| Vector|
@@ -75,12 +78,13 @@ using device_image_to_column_f32_instances = std::tuple<
        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F32,         F32,   128,    32,    64,  S<8, 16>,     4>,
        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F32,         F32,   256,    64,    64, S<16, 16>,     4>,
        DeviceImageToColumnImpl<NDimSpatial, InLayout,        F32,         F32,   256,   128,   128, S<16, 16>,     4>
-    // clang-format on
-    >;
+        // clang-format on
+        >;

 template <ck::index_t NDimSpatial, typename InLayout>
-using device_image_to_column_i8_instances = std::tuple<
-    // clang-format off
+using device_image_to_column_i8_instances =
+    std::tuple<
+        // clang-format off
        //#####################|        Num| InLayout| InDataType| OutDataType| Block|  MPer|  KPer|    Thread| Scalar|
        //#####################|        Dim|         |           |            |  Size| Block| Block|   Cluster|    Per|
        //#####################|    Spatial|         |           |            |      |      |      |   Lengths| Vector|
@@ -96,8 +100,8 @@ using device_image_to_column_i8_instances = std::tuple<
        DeviceImageToColumnImpl<NDimSpatial, InLayout,     int8_t,      int8_t,   256,   128,   128, S<16, 16>,     4>,
        DeviceImageToColumnImpl<NDimSpatial, InLayout,     int8_t,      int8_t,   256,   128,   128, S<16, 16>,     8>,
        DeviceImageToColumnImpl<NDimSpatial, InLayout,     int8_t,      int8_t,   256,   256,   256, S<16, 16>,     16>
-    // clang-format on
-    >;
+        // clang-format on
+        >;

 } // namespace instance
 } // namespace device
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_exp_gemm_xdl_universal_km_kn_mn_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_exp_gemm_xdl_universal_km_kn_mn_instance.hpp
@@ -38,8 +38,9 @@ static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
 static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;

 template <typename InOutDataType, GemmSpecialization GemmSpec>
-using device_gemm_xdl_universal_km_kn_mn_comp_instances = std::tuple<
-    // clang-format off
+using device_gemm_xdl_universal_km_kn_mn_comp_instances =
+    std::tuple<
+        // clang-format off
        //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
        //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
        //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
@@ -56,8 +57,8 @@ using device_gemm_xdl_universal_km_kn_mn_comp_instances = std::tuple<
        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Col,     Row, Tuple<>,     Row,     InOutDataType, InOutDataType, Tuple<>,  InOutDataType,   F32,     InOutDataType,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   4,   4,  32,   32,    2,    2,     S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,               1,              8,              4,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               S<4>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Col,     Row, Tuple<>,     Row,     InOutDataType, InOutDataType, Tuple<>,  InOutDataType,   F32,     InOutDataType,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   4,   4,  32,   32,    2,    2,     S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,               1,              8,              4,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               S<4>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
        DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Col,     Row, Tuple<>,     Row,     InOutDataType, InOutDataType, Tuple<>,  InOutDataType,   F32,     InOutDataType,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   4,   4,  32,   32,    2,    2,     S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,               1,              8,              4,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               S<4>,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>
-    // clang-format on
-    >;
+        // clang-format on
+        >;

 template <typename InOutDataType,
          BlockGemmPipelineScheduler BlkGemmPipeSched,
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_dl_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_dl_instance.hpp
@@ -35,48 +35,51 @@ template <ck::index_t NDimSpatial,
          typename BLayout,
          typename ELayout,
          ConvolutionBackwardWeightSpecialization ConvSpec>
-using device_grouped_conv_bwd_weight_dl_f32_instances = std::tuple<
-    // clang-format off
+using device_grouped_conv_bwd_weight_dl_f32_instances =
+    std::tuple<
+        // clang-format off
        //############################|        Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData|          In|         Wei|         Out|              ConvBackward| Block|  MPer|  NPer| K0Per| K1|  M1Per|  N1Per|   KPer|  M1N1Thread|  M1N1Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|         ABlockTransfer|     ABlockTransfer|         ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|         BBlockTransfer|     BBlockTransfer|         BBlockTransfer|   CThreadTransfer| CThreadTransfer|    CThreadTransfer| 
        //############################|        Dim|         |          |          |   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                    Weight|  Size| Block| Block| Block|   | Thread| Thread| Thread| ClusterM1Xs| ClusterN1Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster| SrcAccessOrder| SrcVectorTensorLengths|    SrcVectorTensor| DstVectorTensorLengths| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster| SrcAccessOrder| SrcVectorTensorLengths|    SrcVectorTensor| DstVectorTensorLengths| SrcDstAccessOrder| SrcDstVectorDim| DstScalarPerVector|
        //############################|    Spatial|         |          |          |       |        |        |        |   Operation|   Operation|   Operation|            Specialization|      |      |      |      |   |       |       |       |            |            |       _K0_M0_M1_K1|         _K0_M0_M1_K1|   ArrangeOrder|               |           _K0_M0_M1_K1| ContiguousDimOrder|           _K0_M0_M1_K1|       _K0_N0_N1_K1|         _K0_N0_N1_K1|   ArrangeOrder|               |           _K0_N0_N1_K1| ContiguousDimOrder|           _K0_N0_N1_K1|                  |                |                   |
        //############################|           |         |          |          |       |        |        |        |            |            |            |                          |      |      |      |      |   |       |       |       |            |            |                   |                     |               |               |                       |                   |                       |                   |                     |               |               |                       |                   |                       |                  |                |                   |
        // generic instance
        DeviceGroupedConvBwdWeight_Dl< NDimSpatial,  ALayout,   BLayout,   ELayout,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,   128,   128,    16,  1,      4,      4,      1,     S<8, 2>,     S<8, 2>,   S<1, 8, 1, 1, 1>,   S<1, 2, 1, 128, 1>, S<0, 2, 3, 1, 4>, S<0, 2, 3, 1, 4>,   S<1, 1, 1, 1, 1>,   S<0, 2, 3, 1, 4>,       S<1, 1, 1, 1, 1>,   S<1, 1, 1, 8, 1>,   S<1, 16, 1, 16, 1>, S<0, 1, 4, 2, 3>, S<0, 1, 4, 2, 3>,   S<1, 1, 1, 1, 1>,   S<0, 1, 4, 2, 3>,       S<1, 1, 1, 1, 1>, S<0, 1, 2, 3, 4, 5>,             5,                   1>
-    // clang-format on
-    >;
+        // clang-format on
+        >;

 template <ck::index_t NDimSpatial,
          typename ALayout,
          typename BLayout,
          typename ELayout,
          ConvolutionBackwardWeightSpecialization ConvSpec>
-using device_grouped_conv_bwd_weight_dl_f16_instances = std::tuple<
-    // clang-format off
+using device_grouped_conv_bwd_weight_dl_f16_instances =
+    std::tuple<
+        // clang-format off
        //############################|        Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData|          In|         Wei|         Out|              ConvBackward| Block|  MPer|  NPer| K0Per| K1|  M1Per|  N1Per|   KPer|  M1N1Thread|  M1N1Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|         ABlockTransfer|     ABlockTransfer|         ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|         BBlockTransfer|     BBlockTransfer|         BBlockTransfer|   CThreadTransfer| CThreadTransfer|    CThreadTransfer| 
        //############################|        Dim|         |          |          |   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                    Weight|  Size| Block| Block| Block|   | Thread| Thread| Thread| ClusterM1Xs| ClusterN1Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster| SrcAccessOrder| SrcVectorTensorLengths|    SrcVectorTensor| DstVectorTensorLengths| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster| SrcAccessOrder| SrcVectorTensorLengths|    SrcVectorTensor| DstVectorTensorLengths| SrcDstAccessOrder| SrcDstVectorDim| DstScalarPerVector|
        //############################|    Spatial|         |          |          |       |        |        |        |   Operation|   Operation|   Operation|            Specialization|      |      |      |      |   |       |       |       |            |            |       _K0_M0_M1_K1|         _K0_M0_M1_K1|   ArrangeOrder|               |           _K0_M0_M1_K1| ContiguousDimOrder|           _K0_M0_M1_K1|       _K0_N0_N1_K1|         _K0_N0_N1_K1|   ArrangeOrder|               |           _K0_N0_N1_K1| ContiguousDimOrder|           _K0_N0_N1_K1|                  |                |                   |
        //############################|           |         |          |          |       |        |        |        |            |            |            |                          |      |      |      |      |   |       |       |       |            |            |                   |                     |               |               |                       |                   |                       |                   |                     |               |               |                       |                   |                       |                  |                |                   |
        // generic instance
        DeviceGroupedConvBwdWeight_Dl< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,   128,   128,    16,  1,      4,      4,      1,     S<8, 2>,     S<8, 2>,   S<1, 8, 1, 1, 1>,   S<1, 2, 1, 128, 1>, S<0, 2, 3, 1, 4>, S<0, 2, 3, 1, 4>,   S<1, 1, 1, 1, 1>,   S<0, 2, 3, 1, 4>,       S<1, 1, 1, 1, 1>,   S<1, 1, 1, 8, 1>,   S<1, 16, 1, 16, 1>, S<0, 1, 4, 2, 3>, S<0, 1, 4, 2, 3>,   S<1, 1, 1, 1, 1>,   S<0, 1, 4, 2, 3>,       S<1, 1, 1, 1, 1>, S<0, 1, 2, 3, 4, 5>,             5,                   1>
-    // clang-format on
-    >;
+        // clang-format on
+        >;

 template <ck::index_t NDimSpatial,
          typename ALayout,
          typename BLayout,
          typename ELayout,
          ConvolutionBackwardWeightSpecialization ConvSpec>
-using device_grouped_conv_bwd_weight_dl_bf16_instances = std::tuple<
-    // clang-format off
+using device_grouped_conv_bwd_weight_dl_bf16_instances =
+    std::tuple<
+        // clang-format off
        //############################|        Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData|          In|         Wei|         Out|              ConvBackward| Block|  MPer|  NPer| K0Per| K1|  M1Per|  N1Per|   KPer|  M1N1Thread|  M1N1Thread|     ABlockTransfer|       ABlockTransfer| ABlockTransfer| ABlockTransfer|         ABlockTransfer|     ABlockTransfer|         ABlockTransfer|     BBlockTransfer|       BBlockTransfer| BBlockTransfer| BBlockTransfer|         BBlockTransfer|     BBlockTransfer|         BBlockTransfer|   CThreadTransfer| CThreadTransfer|    CThreadTransfer| 
        //############################|        Dim|         |          |          |   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                    Weight|  Size| Block| Block| Block|   | Thread| Thread| Thread| ClusterM1Xs| ClusterN1Xs| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster| SrcAccessOrder| SrcVectorTensorLengths|    SrcVectorTensor| DstVectorTensorLengths| ThreadSliceLengths| ThreadClusterLengths|  ThreadCluster| SrcAccessOrder| SrcVectorTensorLengths|    SrcVectorTensor| DstVectorTensorLengths| SrcDstAccessOrder| SrcDstVectorDim| DstScalarPerVector|
        //############################|    Spatial|         |          |          |       |        |        |        |   Operation|   Operation|   Operation|            Specialization|      |      |      |      |   |       |       |       |            |            |       _K0_M0_M1_K1|         _K0_M0_M1_K1|   ArrangeOrder|               |           _K0_M0_M1_K1| ContiguousDimOrder|           _K0_M0_M1_K1|       _K0_N0_N1_K1|         _K0_N0_N1_K1|   ArrangeOrder|               |           _K0_N0_N1_K1| ContiguousDimOrder|           _K0_N0_N1_K1|                  |                |                   |
        //############################|           |         |          |          |       |        |        |        |            |            |            |                          |      |      |      |      |   |       |       |       |            |            |                   |                     |               |               |                       |                   |                       |                   |                     |               |               |                       |                   |                       |                  |                |                   |
        // generic instance
        DeviceGroupedConvBwdWeight_Dl< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,   128,   128,    16,  1,      4,      4,      1,     S<8, 2>,     S<8, 2>,   S<1, 8, 1, 1, 1>,   S<1, 2, 1, 128, 1>, S<0, 2, 3, 1, 4>, S<0, 2, 3, 1, 4>,   S<1, 1, 1, 1, 1>,   S<0, 2, 3, 1, 4>,       S<1, 1, 1, 1, 1>,   S<1, 1, 1, 8, 1>,   S<1, 16, 1, 16, 1>, S<0, 1, 4, 2, 3>, S<0, 1, 4, 2, 3>,   S<1, 1, 1, 1, 1>,   S<0, 1, 4, 2, 3>,       S<1, 1, 1, 1, 1>, S<0, 1, 2, 3, 4, 5>,             5,                   1>
-    // clang-format on
-    >;
+        // clang-format on
+        >;

 } // namespace instance
 } // namespace device
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_wmma_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_wmma_instance.hpp
@@ -37,9 +37,8 @@ template <index_t NDSpatial,
          typename BLayout,
          typename CLayout,
          ConvolutionBackwardWeightSpecialization ConvSpec>
-using device_grouped_conv_bwd_weight_wmma_f16_instances =
-    std::tuple<
-        // clang-format off
+using device_grouped_conv_bwd_weight_wmma_f16_instances = std::tuple<
+    // clang-format off
        //#####################################|    NumDim|       A|       B|       C| AData| BData|  CData| AccData|            A|           B|            C|    ConvForward| Block|  MPer|  NPer|  KPer| K1|  MPer| NPer| MRepeat| NRepeat|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|       CShuffle|       CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
        //#####################################|   Spatial|  Layout|  Layout|  Layout|  Type|  Type|   Type|    Type|  Elementwise| Elementwise|  Elementwise| Specialization|  Size| Block| Block| Block|   |  WMMA| WMMA|        |        |   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MRepeatPerWave| NRepeatPerWave|            _MBlock_MPerBlock| ScalarPerVector|
        //#####################################|          |        |        |        |      |      |       |        |    Operation|   Operation|    Operation|               |      |      |      |      |   |      |     |        |        | Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |     PerShuffle|     PerShuffle|            _NBlock_NPerBlock|      _NPerBlock|
@@ -71,17 +70,16 @@ using device_grouped_conv_bwd_weight_wmma_f16_instances =
        DeviceGroupedConvBwdWeight_Wmma_CShuffle<NDSpatial, ALayout, BLayout, CLayout,  F16,   F16,  F16,  F32,  PassThrough, PassThrough, PassThrough,       ConvSpec,           32,    64,    32,     8,  8,    16,   16,       4,       2,      S<8, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              8,         1,      S<8, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              8,         1,           1,           1,               S<1, 16, 1, 2>,               8>,  
        DeviceGroupedConvBwdWeight_Wmma_CShuffle<NDSpatial, ALayout, BLayout, CLayout,  F16,   F16,  F16,  F32,  PassThrough, PassThrough, PassThrough,       ConvSpec,           32,    64,    16,     8,  8,    16,   16,       4,       1,      S<8, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              8,         1,      S<8, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,         1,           1,           1,               S<1, 16, 1, 2>,               8>,  
        DeviceGroupedConvBwdWeight_Wmma_CShuffle<NDSpatial, ALayout, BLayout, CLayout,  F16,   F16,  F16,  F32,  PassThrough, PassThrough, PassThrough,       ConvSpec,           32,    32,    16,     8,  8,    16,   16,       2,       1,      S<8, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              8,         1,      S<8, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,         1,           1,           1,               S<1, 16, 1, 2>,               8>
-        // clang-format on
-        >;
+    // clang-format on
+    >;

 template <index_t NDSpatial,
          typename ALayout,
          typename BLayout,
          typename CLayout,
          ConvolutionBackwardWeightSpecialization ConvSpec>
-using device_grouped_conv_bwd_weight_wmma_i8_instances =
-    std::tuple<
-        // clang-format off
+using device_grouped_conv_bwd_weight_wmma_i8_instances = std::tuple<
+    // clang-format off
        //#####################################|    NumDim|       A|       B|       C| AData| BData|  CData| AccData|            A|           B|            C|    ConvForward| Block|  MPer|  NPer|  KPer| K1|  MPer| NPer| MRepeat| NRepeat|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|       CShuffle|       CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
        //#####################################|   Spatial|  Layout|  Layout|  Layout|  Type|  Type|   Type|    Type|  Elementwise| Elementwise|  Elementwise| Specialization|  Size| Block| Block| Block|   |  WMMA| WMMA|        |        |   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MRepeatPerWave| NRepeatPerWave|            _MBlock_MPerBlock| ScalarPerVector|
        //#####################################|          |        |        |        |      |      |       |        |    Operation|   Operation|    Operation|               |      |      |      |      |   |      |     |        |        | Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |     PerShuffle|     PerShuffle|            _NBlock_NPerBlock|      _NPerBlock|
@@ -110,8 +108,8 @@ using device_grouped_conv_bwd_weight_wmma_i8_instances =
        DeviceGroupedConvBwdWeight_Wmma_CShuffle<NDSpatial, ALayout, BLayout, CLayout, I8,    I8,  I8,  I32,    PassThrough, PassThrough, PassThrough,       ConvSpec,           32,    64,    64,     8,   8,    16,   16,       4,       4,      S<8, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              16,              8,         1,      S<8, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,           16,              8,         1,           1,           1,               S<1, 16, 1, 2>,               8>,  
        DeviceGroupedConvBwdWeight_Wmma_CShuffle<NDSpatial, ALayout, BLayout, CLayout, I8,    I8,  I8,  I32,    PassThrough, PassThrough, PassThrough,       ConvSpec,           32,    32,    32,     8,   8,    16,   16,       2,       2,      S<8, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,               8,              8,         1,      S<8, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,            8,              8,         1,           1,           1,               S<1, 16, 1, 2>,               8>,  
        DeviceGroupedConvBwdWeight_Wmma_CShuffle<NDSpatial, ALayout, BLayout, CLayout, I8,    I8,  I8,  I32,    PassThrough, PassThrough, PassThrough,       ConvSpec,           32,    64,    16,     8,   8,    16,   16,       4,       1,      S<8, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              16,              8,         1,      S<8, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,            4,              8,         1,           1,           1,               S<1, 16, 1, 2>,               8>
-        // clang-format on
-        >;
+    // clang-format on
+    >;

 } // namespace instance
 } // namespace device