Conv:TF32: add more instances - 2 (#2879)

* add instances of device_grouped_conv_fwd_xdl_f32_comp_instances
* add instances of device_grouped_conv_fwd_xdl_f32_tf32_mem_instances
* add instances of device_grouped_conv_fwd_xdl_large_tensor_f32_tf32_instances
* tf32:conv:add instances for base class DeviceConvFwd
* tf32:conv:add instances for base class DeviceGroupedConvBwdDataMultipleD
* tf32:conv:add instances for base class DeviceGroupedConvBwdWeight
* add tf32 in profiler
* remove gnhwc/ngchw/ngcdhw instances
* remove non-ndhwgc/nhwgc/nhwc instances
* add check in IsSupportedArgument()
This commit is contained in:
yinglu
2025-10-10 15:28:17 +08:00
committed by GitHub
parent ad7a215aba
commit fada1a3cae
56 changed files with 2119 additions and 152 deletions

View File

@@ -29,7 +29,8 @@ template <ck::index_t NDimSpatial,
typename InLayout,
typename OutDataType,
typename WeiDataType,
typename InDataType>
typename InDataType,
typename ComputeDataType = InDataType>
bool profile_grouped_conv_bwd_data_impl(int do_verification,
int init_method,
bool do_log,
@@ -96,7 +97,11 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp>();
OutElementOp,
0,
0,
0,
ComputeDataType>();
auto ref_invoker = ref_conv.MakeInvoker();
@@ -171,9 +176,13 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
{
in_device_buf.FromDevice(in_device.mData.data());
using ComputeType = std::conditional_t<sizeof(OutDataType) < sizeof(WeiDataType),
OutDataType,
WeiDataType>;
using ComputeType_ = std::conditional_t<sizeof(OutDataType) < sizeof(WeiDataType),
OutDataType,
WeiDataType>;
using ComputeType =
std::conditional_t<sizeof(ComputeType_) < sizeof(ComputeDataType),
ComputeType_,
ComputeDataType>;
using AccDataType =
std::conditional_t<std::is_same_v<ComputeType, int8_t>, int32_t, float>;
const index_t num_accums = conv_param.K_;
@@ -222,18 +231,21 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
};
// do GEMM
using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvBwdDataMultipleD<NDimSpatial,
OutLayout,
WeiLayout,
ck::Tuple<>,
InLayout,
OutDataType,
WeiDataType,
ck::Tuple<>,
InDataType,
OutElementOp,
WeiElementOp,
InElementOp>;
using DeviceOp =
ck::tensor_operation::device::DeviceGroupedConvBwdDataMultipleD<NDimSpatial,
OutLayout,
WeiLayout,
ck::Tuple<>,
InLayout,
OutDataType,
WeiDataType,
ck::Tuple<>,
InDataType,
OutElementOp,
WeiElementOp,
InElementOp,
ComputeDataType,
ComputeDataType>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<

View File

@@ -21,9 +21,10 @@ enum struct ConvLayout
enum struct ConvDataType
{
F32_F32_F32, // 0
F16_F16_F16, // 1
BF16_BF16_BF16, // 2
F32_F32_F32, // 0
F16_F16_F16, // 1
BF16_BF16_BF16, // 2
F32_F32_F32_TF32, // 3
};
#define OP_NAME "grouped_conv_bwd_data"
@@ -37,6 +38,7 @@ static void print_helper_msg()
<< "arg2: data type (0: Output fp32, Weight fp32, Input fp32\n"
<< " 1: Output fp16, Weight fp16, Input fp16\n"
<< " 2: Output bf16, Weight bf16, Input bf16\n"
<< " 3: Output fp32, Weight fp32, Input fp32, Compute tf32)\n"
<< "arg3: tensor layout (0: Output[G, N, Ho, Wo, K], Weight[G, K, Y, X, C], Input[G, N, Hi, Wi, C]\n"
<< " 1: Output[N, Ho, Wo, G, K], Weight[G, K, Y, X, C], Input[N, Hi, Wi, G, C])\n"
<< " 2: Output[N, G, K, Ho, Wo], Weight[G, K, Y, X, C], Input[N, G, C, Hi, Wi])\n"
@@ -82,6 +84,9 @@ int profile_grouped_conv_bwd_data(int argc, char* argv[])
using F32 = float;
using F16 = ck::half_t;
using BF16 = ck::bhalf_t;
#if defined(__gfx942__)
using TF32 = ck::tf32_t;
#endif
using namespace ck::tensor_layout::convolution;
@@ -94,16 +99,18 @@ int profile_grouped_conv_bwd_data(int argc, char* argv[])
auto in_layout,
auto wei_type,
auto out_type,
auto in_type) {
auto in_type,
auto compute_type) {
constexpr ck::index_t NDimSpatial = num_dim_spatial_tmp.value;
using OutLayout = decltype(out_layout);
using WeiLayout = decltype(wei_layout);
using InLayout = decltype(in_layout);
using OutDataType = decltype(out_type);
using WeiDataType = decltype(wei_type);
using InDataType = decltype(in_type);
using OutDataType = decltype(out_type);
using WeiDataType = decltype(wei_type);
using InDataType = decltype(in_type);
using ComputeDataType = decltype(compute_type);
bool pass = ck::profiler::profile_grouped_conv_bwd_data_impl<NDimSpatial,
OutLayout,
@@ -111,7 +118,8 @@ int profile_grouped_conv_bwd_data(int argc, char* argv[])
InLayout,
OutDataType,
WeiDataType,
InDataType>(
InDataType,
ComputeDataType>(
do_verification, init_method, do_log, time_kernel, params, split_k);
return pass ? 0 : 1;
@@ -123,60 +131,84 @@ int profile_grouped_conv_bwd_data(int argc, char* argv[])
{
if(data_type == ConvDataType::F32_F32_F32)
{
return profile(I2, GNHWK{}, GKYXC{}, GNHWC{}, F32{}, F32{}, F32{});
return profile(I2, GNHWK{}, GKYXC{}, GNHWC{}, F32{}, F32{}, F32{}, F32{});
}
else if(data_type == ConvDataType::F16_F16_F16)
{
return profile(I2, GNHWK{}, GKYXC{}, GNHWC{}, F16{}, F16{}, F16{});
return profile(I2, GNHWK{}, GKYXC{}, GNHWC{}, F16{}, F16{}, F16{}, F16{});
}
else if(data_type == ConvDataType::BF16_BF16_BF16)
{
return profile(I2, GNHWK{}, GKYXC{}, GNHWC{}, BF16{}, BF16{}, BF16{});
return profile(I2, GNHWK{}, GKYXC{}, GNHWC{}, BF16{}, BF16{}, BF16{}, BF16{});
}
else if(data_type == ConvDataType::F32_F32_F32_TF32)
{
#if defined(__gfx942__)
return profile(I2, GNHWK{}, GKYXC{}, GNHWC{}, F32{}, F32{}, F32{}, TF32{});
#endif
}
}
else if(layout == ConvLayout::NHWGC_GKYXC_NHWGK)
{
if(data_type == ConvDataType::F32_F32_F32)
{
return profile(I2, NHWGK{}, GKYXC{}, NHWGC{}, F32{}, F32{}, F32{});
return profile(I2, NHWGK{}, GKYXC{}, NHWGC{}, F32{}, F32{}, F32{}, F32{});
}
else if(data_type == ConvDataType::F16_F16_F16)
{
return profile(I2, NHWGK{}, GKYXC{}, NHWGC{}, F16{}, F16{}, F16{});
return profile(I2, NHWGK{}, GKYXC{}, NHWGC{}, F16{}, F16{}, F16{}, F16{});
}
else if(data_type == ConvDataType::BF16_BF16_BF16)
{
return profile(I2, NHWGK{}, GKYXC{}, NHWGC{}, BF16{}, BF16{}, BF16{});
return profile(I2, NHWGK{}, GKYXC{}, NHWGC{}, BF16{}, BF16{}, BF16{}, BF16{});
}
else if(data_type == ConvDataType::F32_F32_F32_TF32)
{
#if defined(__gfx942__)
return profile(I2, NHWGK{}, GKYXC{}, NHWGC{}, F32{}, F32{}, F32{}, TF32{});
#endif
}
}
else if(layout == ConvLayout::NGCHW_GKYXC_NGKHW)
{
if(data_type == ConvDataType::F32_F32_F32)
{
return profile(I2, NGKHW{}, GKYXC{}, NGCHW{}, F32{}, F32{}, F32{});
return profile(I2, NGKHW{}, GKYXC{}, NGCHW{}, F32{}, F32{}, F32{}, F32{});
}
else if(data_type == ConvDataType::F16_F16_F16)
{
return profile(I2, NGKHW{}, GKYXC{}, NGCHW{}, F16{}, F16{}, F16{});
return profile(I2, NGKHW{}, GKYXC{}, NGCHW{}, F16{}, F16{}, F16{}, F16{});
}
else if(data_type == ConvDataType::BF16_BF16_BF16)
{
return profile(I2, NGKHW{}, GKYXC{}, NGCHW{}, BF16{}, BF16{}, BF16{});
return profile(I2, NGKHW{}, GKYXC{}, NGCHW{}, BF16{}, BF16{}, BF16{}, BF16{});
}
else if(data_type == ConvDataType::F32_F32_F32_TF32)
{
#if defined(__gfx942__)
return profile(I2, NGKHW{}, GKYXC{}, NGCHW{}, F32{}, F32{}, F32{}, TF32{});
#endif
}
}
else if(layout == ConvLayout::NGCHW_GKCYX_NGKHW)
{
if(data_type == ConvDataType::F32_F32_F32)
{
return profile(I2, NGKHW{}, GKCYX{}, NGCHW{}, F32{}, F32{}, F32{});
return profile(I2, NGKHW{}, GKCYX{}, NGCHW{}, F32{}, F32{}, F32{}, F32{});
}
else if(data_type == ConvDataType::F16_F16_F16)
{
return profile(I2, NGKHW{}, GKCYX{}, NGCHW{}, F16{}, F16{}, F16{});
return profile(I2, NGKHW{}, GKCYX{}, NGCHW{}, F16{}, F16{}, F16{}, F16{});
}
else if(data_type == ConvDataType::BF16_BF16_BF16)
{
return profile(I2, NGKHW{}, GKCYX{}, NGCHW{}, BF16{}, BF16{}, BF16{});
return profile(I2, NGKHW{}, GKCYX{}, NGCHW{}, BF16{}, BF16{}, BF16{}, BF16{});
}
else if(data_type == ConvDataType::F32_F32_F32_TF32)
{
#if defined(__gfx942__)
return profile(I2, NGKHW{}, GKCYX{}, NGCHW{}, F32{}, F32{}, F32{}, TF32{});
#endif
}
}
}
@@ -186,60 +218,84 @@ int profile_grouped_conv_bwd_data(int argc, char* argv[])
{
if(data_type == ConvDataType::F32_F32_F32)
{
return profile(I3, GNDHWK{}, GKZYXC{}, GNDHWC{}, F32{}, F32{}, F32{});
return profile(I3, GNDHWK{}, GKZYXC{}, GNDHWC{}, F32{}, F32{}, F32{}, F32{});
}
else if(data_type == ConvDataType::F16_F16_F16)
{
return profile(I3, GNDHWK{}, GKZYXC{}, GNDHWC{}, F16{}, F16{}, F16{});
return profile(I3, GNDHWK{}, GKZYXC{}, GNDHWC{}, F16{}, F16{}, F16{}, F16{});
}
else if(data_type == ConvDataType::BF16_BF16_BF16)
{
return profile(I3, GNDHWK{}, GKZYXC{}, GNDHWC{}, BF16{}, BF16{}, BF16{});
return profile(I3, GNDHWK{}, GKZYXC{}, GNDHWC{}, BF16{}, BF16{}, BF16{}, BF16{});
}
else if(data_type == ConvDataType::F32_F32_F32_TF32)
{
#if defined(__gfx942__)
return profile(I3, GNDHWK{}, GKZYXC{}, GNDHWC{}, F32{}, F32{}, F32{}, TF32{});
#endif
}
}
else if(layout == ConvLayout::NHWGC_GKYXC_NHWGK)
{
if(data_type == ConvDataType::F32_F32_F32)
{
return profile(I3, NDHWGK{}, GKZYXC{}, NDHWGC{}, F32{}, F32{}, F32{});
return profile(I3, NDHWGK{}, GKZYXC{}, NDHWGC{}, F32{}, F32{}, F32{}, F32{});
}
else if(data_type == ConvDataType::F16_F16_F16)
{
return profile(I3, NDHWGK{}, GKZYXC{}, NDHWGC{}, F16{}, F16{}, F16{});
return profile(I3, NDHWGK{}, GKZYXC{}, NDHWGC{}, F16{}, F16{}, F16{}, F16{});
}
else if(data_type == ConvDataType::BF16_BF16_BF16)
{
return profile(I3, NDHWGK{}, GKZYXC{}, NDHWGC{}, BF16{}, BF16{}, BF16{});
return profile(I3, NDHWGK{}, GKZYXC{}, NDHWGC{}, BF16{}, BF16{}, BF16{}, BF16{});
}
else if(data_type == ConvDataType::F32_F32_F32_TF32)
{
#if defined(__gfx942__)
return profile(I3, NDHWGK{}, GKZYXC{}, NDHWGC{}, F32{}, F32{}, F32{}, TF32{});
#endif
}
}
else if(layout == ConvLayout::NGCHW_GKYXC_NGKHW)
{
if(data_type == ConvDataType::F32_F32_F32)
{
return profile(I3, NGKDHW{}, GKZYXC{}, NGCDHW{}, F32{}, F32{}, F32{});
return profile(I3, NGKDHW{}, GKZYXC{}, NGCDHW{}, F32{}, F32{}, F32{}, F32{});
}
else if(data_type == ConvDataType::F16_F16_F16)
{
return profile(I3, NGKDHW{}, GKZYXC{}, NGCDHW{}, F16{}, F16{}, F16{});
return profile(I3, NGKDHW{}, GKZYXC{}, NGCDHW{}, F16{}, F16{}, F16{}, F16{});
}
else if(data_type == ConvDataType::BF16_BF16_BF16)
{
return profile(I3, NGKDHW{}, GKZYXC{}, NGCDHW{}, BF16{}, BF16{}, BF16{});
return profile(I3, NGKDHW{}, GKZYXC{}, NGCDHW{}, BF16{}, BF16{}, BF16{}, BF16{});
}
else if(data_type == ConvDataType::F32_F32_F32_TF32)
{
#if defined(__gfx942__)
return profile(I3, NGKDHW{}, GKZYXC{}, NGCDHW{}, F32{}, F32{}, F32{}, TF32{});
#endif
}
}
else if(layout == ConvLayout::NGCHW_GKCYX_NGKHW)
{
if(data_type == ConvDataType::F32_F32_F32)
{
return profile(I3, NGKDHW{}, GKCZYX{}, NGCDHW{}, F32{}, F32{}, F32{});
return profile(I3, NGKDHW{}, GKCZYX{}, NGCDHW{}, F32{}, F32{}, F32{}, F32{});
}
else if(data_type == ConvDataType::F16_F16_F16)
{
return profile(I3, NGKDHW{}, GKCZYX{}, NGCDHW{}, F16{}, F16{}, F16{});
return profile(I3, NGKDHW{}, GKCZYX{}, NGCDHW{}, F16{}, F16{}, F16{}, F16{});
}
else if(data_type == ConvDataType::BF16_BF16_BF16)
{
return profile(I3, NGKDHW{}, GKCZYX{}, NGCDHW{}, BF16{}, BF16{}, BF16{});
return profile(I3, NGKDHW{}, GKCZYX{}, NGCDHW{}, BF16{}, BF16{}, BF16{}, BF16{});
}
else if(data_type == ConvDataType::F32_F32_F32_TF32)
{
#if defined(__gfx942__)
return profile(I3, NGKDHW{}, GKCZYX{}, NGCDHW{}, F32{}, F32{}, F32{}, TF32{});
#endif
}
}
}

View File

@@ -28,6 +28,7 @@ enum struct ConvDataType
F16_F16_F16_BF8_F8, // 3
I8_I8_I8, // 4
BF16_BF16_BF16, // 5
F32_F32_F32_TF32, // 6
};
#define OP_NAME "grouped_conv_bwd_weight"
@@ -41,7 +42,8 @@ static void print_helper_msg()
<< " 2: Input bf16, Weight fp32, Output bf16\n"
<< " 3: Input fp16, Weight fp16, Output fp16, Gemm bf8@fp8\n"
<< " 4: Input int8, Weight int8, Output int8\n"
<< " 5: Input bf16, Weight bf16, Output bf16)\n"
<< " 5: Input bf16, Weight bf16, Output bf16\n"
<< " 6: Input fp32, Weight fp32, Output fp32, Compute tf32)\n"
<< "arg3: tensor layout (0: Input[G, N, C, Hi, Wi], Weight[G, K, C, Y, X], Output[G, "
"N, K, Ho, Wo]\n"
<< " 1: Input[G, N, Hi, Wi, C], Weight[G, K, Y, X, C], Output[G, "
@@ -97,6 +99,9 @@ int profile_grouped_conv_bwd_weight(int argc, char* argv[])
using BF16 = ck::bhalf_t;
using F8 = ck::f8_t;
using BF8 = ck::bf8_t;
#if defined(__gfx942__)
using TF32 = ck::tf32_t;
#endif
using namespace ck::tensor_layout::convolution;
@@ -155,6 +160,12 @@ int profile_grouped_conv_bwd_weight(int argc, char* argv[])
// fp32 atomic add is used for weight tensor in bf16 kernel
return profile(I1, GNWC{}, GKXC{}, GNWK{}, BF16{}, F32{}, BF16{}, BF16{}, BF16{});
}
else if(data_type == ConvDataType::F32_F32_F32_TF32)
{
#if defined(__gfx942__)
return profile(I1, GNWC{}, GKXC{}, GNWK{}, F32{}, F32{}, F32{}, TF32{}, TF32{});
#endif
}
}
if(num_dim_spatial == 2 && layout == ConvLayout::GNHWC_GKYXC_GNHWK)
{
@@ -171,6 +182,12 @@ int profile_grouped_conv_bwd_weight(int argc, char* argv[])
// fp32 atomic add is used for weight tensor in bf16 kernel
return profile(I2, GNHWC{}, GKYXC{}, GNHWK{}, BF16{}, F32{}, BF16{}, BF16{}, BF16{});
}
else if(data_type == ConvDataType::F32_F32_F32_TF32)
{
#if defined(__gfx942__)
return profile(I2, GNHWC{}, GKYXC{}, GNHWK{}, F32{}, F32{}, F32{}, TF32{}, TF32{});
#endif
}
}
if(num_dim_spatial == 2 && layout == ConvLayout::NHWGC_GKYXC_NHWGK)
{
@@ -191,6 +208,12 @@ int profile_grouped_conv_bwd_weight(int argc, char* argv[])
{
return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{});
}
else if(data_type == ConvDataType::F32_F32_F32_TF32)
{
#if defined(__gfx942__)
return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, F32{}, F32{}, F32{}, TF32{}, TF32{});
#endif
}
}
else if(num_dim_spatial == 2 && layout == ConvLayout::NGCHW_GKYXC_NGKHW)
{
@@ -218,6 +241,12 @@ int profile_grouped_conv_bwd_weight(int argc, char* argv[])
{
return profile(I2, NGCHW{}, GKCYX{}, NGKHW{}, F32{}, F32{}, F32{}, F32{}, F32{});
}
else if(data_type == ConvDataType::F32_F32_F32_TF32)
{
#if defined(__gfx942__)
return profile(I2, NGCHW{}, GKCYX{}, NGKHW{}, F32{}, F32{}, F32{}, TF32{}, TF32{});
#endif
}
}
if(num_dim_spatial == 3 && layout == ConvLayout::GNHWC_GKYXC_GNHWK)
{
@@ -239,6 +268,12 @@ int profile_grouped_conv_bwd_weight(int argc, char* argv[])
return profile(
I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, int8_t{}, int8_t{}, int8_t{}, int8_t{}, int8_t{});
}
else if(data_type == ConvDataType::F32_F32_F32_TF32)
{
#if defined(__gfx942__)
return profile(I3, GNDHWC{}, GKZYXC{}, GNDHWK{}, F32{}, F32{}, F32{}, TF32{}, TF32{});
#endif
}
}
if(num_dim_spatial == 3 && layout == ConvLayout::NHWGC_GKYXC_NHWGK)
{
@@ -269,6 +304,12 @@ int profile_grouped_conv_bwd_weight(int argc, char* argv[])
return profile(
I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, int8_t{}, int8_t{}, int8_t{}, int8_t{}, int8_t{});
}
else if(data_type == ConvDataType::F32_F32_F32_TF32)
{
#if defined(__gfx942__)
return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F32{}, F32{}, F32{}, TF32{}, TF32{});
#endif
}
}
else if(num_dim_spatial == 3 && layout == ConvLayout::NGCHW_GKYXC_NGKHW)
{
@@ -297,6 +338,12 @@ int profile_grouped_conv_bwd_weight(int argc, char* argv[])
{
return profile(I3, NGCDHW{}, GKCZYX{}, NGKDHW{}, F32{}, F32{}, F32{}, F32{}, F32{});
}
else if(data_type == ConvDataType::F32_F32_F32_TF32)
{
#if defined(__gfx942__)
return profile(I3, NGCDHW{}, GKCZYX{}, NGKDHW{}, F32{}, F32{}, F32{}, TF32{}, TF32{});
#endif
}
}
std::cout << "this data_type & layout is not implemented" << std::endl;

View File

@@ -226,6 +226,12 @@ int profile_grouped_conv_fwd(int argc, char* argv[])
{
return profile(I1, GNWC{}, GKXC{}, GNWK{}, INT8{}, INT8{}, INT8{}, INT8{}, INT8{});
}
else if(data_type == ConvDataType::F32_F32_F32_TF32)
{
#if defined(__gfx942__)
return profile(I1, GNWC{}, GKXC{}, GNWK{}, F32{}, F32{}, F32{}, TF32{}, TF32{});
#endif
}
}
else if(num_dim_spatial == 2 && layout == ConvLayout::GNHWC_GKYXC_GNHWK)
{
@@ -245,6 +251,12 @@ int profile_grouped_conv_fwd(int argc, char* argv[])
{
return profile(I2, GNHWC{}, GKYXC{}, GNHWK{}, INT8{}, INT8{}, INT8{}, INT8{}, INT8{});
}
else if(data_type == ConvDataType::F32_F32_F32_TF32)
{
#if defined(__gfx942__)
return profile(I2, GNHWC{}, GKYXC{}, GNHWK{}, F32{}, F32{}, F32{}, TF32{}, TF32{});
#endif
}
}
else if(num_dim_spatial == 3 && layout == ConvLayout::GNHWC_GKYXC_GNHWK)
{
@@ -292,6 +304,12 @@ int profile_grouped_conv_fwd(int argc, char* argv[])
{
return profile(I1, NWGC{}, GKXC{}, NWGK{}, INT8{}, INT8{}, INT8{}, INT8{}, INT8{});
}
else if(data_type == ConvDataType::F32_F32_F32_TF32)
{
#if defined(__gfx942__)
return profile(I1, NWGC{}, GKXC{}, NWGK{}, F32{}, F32{}, F32{}, TF32{}, TF32{});
#endif
}
}
else if(num_dim_spatial == 2 && layout == ConvLayout::NHWGC_GKYXC_NHWGK)
{
@@ -311,6 +329,12 @@ int profile_grouped_conv_fwd(int argc, char* argv[])
{
return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, INT8{}, INT8{}, INT8{}, INT8{}, INT8{});
}
else if(data_type == ConvDataType::F32_F32_F32_TF32)
{
#if defined(__gfx942__)
return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, F32{}, F32{}, F32{}, TF32{}, TF32{});
#endif
}
}
else if(num_dim_spatial == 2 && layout == ConvLayout::NGCHW_GKYXC_NGKHW)
{
@@ -326,6 +350,12 @@ int profile_grouped_conv_fwd(int argc, char* argv[])
{
return profile(I2, NGCHW{}, GKYXC{}, NGKHW{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{});
}
else if(data_type == ConvDataType::F32_F32_F32_TF32)
{
#if defined(__gfx942__)
return profile(I2, NGCHW{}, GKYXC{}, NGKHW{}, F32{}, F32{}, F32{}, TF32{}, TF32{});
#endif
}
}
else if(num_dim_spatial == 2 && layout == ConvLayout::NGCHW_GKCYX_NGKHW)
{
@@ -341,6 +371,12 @@ int profile_grouped_conv_fwd(int argc, char* argv[])
{
return profile(I2, NGCHW{}, GKCYX{}, NGKHW{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{});
}
else if(data_type == ConvDataType::F32_F32_F32_TF32)
{
#if defined(__gfx942__)
return profile(I2, NGCHW{}, GKCYX{}, NGKHW{}, F32{}, F32{}, F32{}, TF32{}, TF32{});
#endif
}
}
else if(num_dim_spatial == 3 && layout == ConvLayout::NHWGC_GKYXC_NHWGK)
{

View File

@@ -20,14 +20,15 @@ enum struct ConvLayout
enum struct ConvDataType
{
F32_F32_F32, // 0
F16_F16_F16, // 1
BF16_BF16_BF16, // 2
INT8_INT8_INT8, // 3
F8_F8_F8, // 4
BF8_BF8_F8, // 5
F8_BF8_F8, // 6
BF8_F8_F8, // 7
F32_F32_F32, // 0
F16_F16_F16, // 1
BF16_BF16_BF16, // 2
INT8_INT8_INT8, // 3
F8_F8_F8, // 4
BF8_BF8_F8, // 5
F8_BF8_F8, // 6
BF8_F8_F8, // 7
F32_F32_F32_TF32, // 8
};
enum struct IndexType
@@ -51,7 +52,8 @@ static void print_helper_msg()
<< " 4: Input fp8, Weight fp8, Output fp8\n"
<< " 5: Input bf8, Weight bf8, Output fp8\n"
<< " 6: Input fp8, Weight bf8, Output fp8\n"
<< " 7: Input bf8, Weight fp8, Output fp8)\n"
<< " 7: Input bf8, Weight fp8, Output fp8\n"
<< " 8: Input fp32, Weight fp32, Output fp32, Compute tf32)\n"
<< "arg3: tensor layout (0: Input[G, N, Hi, Wi, C], Weight[G, K, Y, X, C], Output[G, N, Ho, Wo, K]\n"
<< " 1: Input[N, Hi, Wi, G, C], Weight[G, K, Y, X, C], Output[N, Ho, Wo, G, K]\n"
<< " 2: Input[N, G, C, Hi, Wi], Weight[G, K, Y, X, C], Output[N, "
@@ -103,6 +105,9 @@ int grouped_conv_fwd_bias_clamp(int argc, char* argv[])
using F32 = float;
using BF16 = ck::bhalf_t;
using F16 = ck::half_t;
#if defined(__gfx942__)
using TF32 = ck::tf32_t;
#endif
using GKZYXC = ck::tensor_layout::convolution::GKZYXC;
using NDHWGC = ck::tensor_layout::convolution::NDHWGC;
@@ -165,6 +170,12 @@ int grouped_conv_fwd_bias_clamp(int argc, char* argv[])
{
return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{});
}
else if(data_type == ConvDataType::F32_F32_F32_TF32)
{
#if defined(__gfx942__)
return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, F32{}, F32{}, F32{}, TF32{}, TF32{});
#endif
}
}
else if(num_dim_spatial == 3 && layout == ConvLayout::NHWGC_GKYXC_NHWGK)
{
@@ -181,6 +192,12 @@ int grouped_conv_fwd_bias_clamp(int argc, char* argv[])
return profile(
I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{});
}
else if(data_type == ConvDataType::F32_F32_F32_TF32)
{
#if defined(__gfx942__)
return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F32{}, F32{}, F32{}, TF32{}, TF32{});
#endif
}
}
std::cout << "this data_type & layout is not implemented" << std::endl;

View File

@@ -20,14 +20,15 @@ enum struct ConvLayout
enum struct ConvDataType
{
F32_F32_F32, // 0
F16_F16_F16, // 1
BF16_BF16_BF16, // 2
INT8_INT8_INT8, // 3
F8_F8_F8, // 4
BF8_BF8_F8, // 5
F8_BF8_F8, // 6
BF8_F8_F8, // 7
F32_F32_F32, // 0
F16_F16_F16, // 1
BF16_BF16_BF16, // 2
INT8_INT8_INT8, // 3
F8_F8_F8, // 4
BF8_BF8_F8, // 5
F8_BF8_F8, // 6
BF8_F8_F8, // 7
F32_F32_F32_TF32, // 8
};
enum struct IndexType
@@ -51,7 +52,8 @@ static void print_helper_msg()
<< " 4: Input fp8, Weight fp8, Output fp8\n"
<< " 5: Input bf8, Weight bf8, Output fp8\n"
<< " 6: Input fp8, Weight bf8, Output fp8\n"
<< " 7: Input bf8, Weight fp8, Output fp8)\n"
<< " 7: Input bf8, Weight fp8, Output fp8\n"
<< " 8: Input fp32, Weight fp32, Output fp32, Compute tf32)\n"
<< "arg3: tensor layout (0: Input[G, N, Hi, Wi, C], Weight[G, K, Y, X, C], Output[G, N, Ho, Wo, K]\n"
<< " 1: Input[N, Hi, Wi, G, C], Weight[G, K, Y, X, C], Output[N, Ho, Wo, G, K]\n"
<< " 2: Input[N, G, C, Hi, Wi], Weight[G, K, Y, X, C], Output[N, "
@@ -103,6 +105,9 @@ int grouped_conv_fwd_clamp(int argc, char* argv[])
using F32 = float;
using BF16 = ck::bhalf_t;
using F16 = ck::half_t;
#if defined(__gfx942__)
using TF32 = ck::tf32_t;
#endif
using GKZYXC = ck::tensor_layout::convolution::GKZYXC;
using NDHWGC = ck::tensor_layout::convolution::NDHWGC;
@@ -168,6 +173,12 @@ int grouped_conv_fwd_clamp(int argc, char* argv[])
{
return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{});
}
else if(data_type == ConvDataType::F32_F32_F32_TF32)
{
#if defined(__gfx942__)
return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, F32{}, F32{}, F32{}, TF32{}, TF32{});
#endif
}
}
else if(num_dim_spatial == 3 && layout == ConvLayout::NHWGC_GKYXC_NHWGK)
{
@@ -184,6 +195,12 @@ int grouped_conv_fwd_clamp(int argc, char* argv[])
return profile(
I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{});
}
else if(data_type == ConvDataType::F32_F32_F32_TF32)
{
#if defined(__gfx942__)
return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, F32{}, F32{}, F32{}, TF32{}, TF32{});
#endif
}
}
std::cout << "this data_type & layout is not implemented" << std::endl;