Conv:TF32: add more instances - 2 (#2879)

* add instances of device_grouped_conv_fwd_xdl_f32_comp_instances * add instances of device_grouped_conv_fwd_xdl_f32_tf32_mem_instances * add instances of device_grouped_conv_fwd_xdl_large_tensor_f32_tf32_instances * tf32:conv:add instances for base class DeviceConvFwd * tf32:conv:add instances for base class DeviceGroupedConvBwdDataMultipleD * tf32:conv:add instances for base class DeviceGroupedConvBwdWeight * add tf32 in profiler * remove gnhwc/ngchw/ngcdhw instances * remove non-ndhwgc/nhwgc/nhwc instances * add check in IsSupportedArgument() [ROCm/composable_kernel commit: fada1a3cae]
2026-05-18 20:09:25 +00:00 · 2025-10-10 15:28:17 +08:00
parent c7f3bcc81e
commit c1780cfebe
56 changed files with 2119 additions and 152 deletions
--- a/include/ck/library/utility/check_err.hpp
+++ b/include/ck/library/utility/check_err.hpp
@@ -31,13 +31,15 @@ double get_relative_threshold(const int number_of_accumulations = 1)
    using F16  = ck::half_t;
    using BF16 = ck::bhalf_t;
    using F32  = float;
+    using TF32 = ck::tf32_t;
    using I8   = int8_t;
    using I32  = int32_t;

    static_assert(is_same_v<ComputeDataType, F4> || is_same_v<ComputeDataType, F8> ||
                      is_same_v<ComputeDataType, F16> || is_same_v<ComputeDataType, BF16> ||
-                      is_same_v<ComputeDataType, F32> || is_same_v<ComputeDataType, I8> ||
-                      is_same_v<ComputeDataType, I32> || is_same_v<ComputeDataType, int>,
+                      is_same_v<ComputeDataType, F32> || is_same_v<ComputeDataType, TF32> ||
+                      is_same_v<ComputeDataType, I8> || is_same_v<ComputeDataType, I32> ||
+                      is_same_v<ComputeDataType, int>,
                  "Warning: Unhandled ComputeDataType for setting up the relative threshold!");
    double compute_error = 0;
    if constexpr(is_same_v<ComputeDataType, I8> || is_same_v<ComputeDataType, I32> ||
@@ -52,8 +54,9 @@ double get_relative_threshold(const int number_of_accumulations = 1)

    static_assert(is_same_v<OutDataType, F4> || is_same_v<OutDataType, F8> ||
                      is_same_v<OutDataType, F16> || is_same_v<OutDataType, BF16> ||
-                      is_same_v<OutDataType, F32> || is_same_v<OutDataType, I8> ||
-                      is_same_v<OutDataType, I32> || is_same_v<OutDataType, int>,
+                      is_same_v<OutDataType, F32> || is_same_v<ComputeDataType, TF32> ||
+                      is_same_v<OutDataType, I8> || is_same_v<OutDataType, I32> ||
+                      is_same_v<OutDataType, int>,
                  "Warning: Unhandled OutDataType for setting up the relative threshold!");
    double output_error = 0;
    if constexpr(is_same_v<OutDataType, I8> || is_same_v<OutDataType, I32> ||
@@ -69,8 +72,9 @@ double get_relative_threshold(const int number_of_accumulations = 1)

    static_assert(is_same_v<AccDataType, F4> || is_same_v<AccDataType, F8> ||
                      is_same_v<AccDataType, F16> || is_same_v<AccDataType, BF16> ||
-                      is_same_v<AccDataType, F32> || is_same_v<AccDataType, I8> ||
-                      is_same_v<AccDataType, I32> || is_same_v<AccDataType, int>,
+                      is_same_v<AccDataType, F32> || is_same_v<ComputeDataType, TF32> ||
+                      is_same_v<AccDataType, I8> || is_same_v<AccDataType, I32> ||
+                      is_same_v<AccDataType, int>,
                  "Warning: Unhandled AccDataType for setting up the relative threshold!");
    double acc_error = 0;
    if constexpr(is_same_v<AccDataType, I8> || is_same_v<AccDataType, I32> ||
@@ -93,13 +97,15 @@ double get_absolute_threshold(const double max_possible_num, const int number_of
    using F16  = ck::half_t;
    using BF16 = ck::bhalf_t;
    using F32  = float;
+    using TF32 = ck::tf32_t;
    using I8   = int8_t;
    using I32  = int32_t;

    static_assert(is_same_v<ComputeDataType, F4> || is_same_v<ComputeDataType, F8> ||
                      is_same_v<ComputeDataType, F16> || is_same_v<ComputeDataType, BF16> ||
-                      is_same_v<ComputeDataType, F32> || is_same_v<ComputeDataType, I8> ||
-                      is_same_v<ComputeDataType, I32> || is_same_v<ComputeDataType, int>,
+                      is_same_v<ComputeDataType, F32> || is_same_v<ComputeDataType, TF32> ||
+                      is_same_v<ComputeDataType, I8> || is_same_v<ComputeDataType, I32> ||
+                      is_same_v<ComputeDataType, int>,
                  "Warning: Unhandled ComputeDataType for setting up the absolute threshold!");
    auto expo            = std::log2(std::abs(max_possible_num));
    double compute_error = 0;
@@ -115,8 +121,9 @@ double get_absolute_threshold(const double max_possible_num, const int number_of

    static_assert(is_same_v<OutDataType, F4> || is_same_v<OutDataType, F8> ||
                      is_same_v<OutDataType, F16> || is_same_v<OutDataType, BF16> ||
-                      is_same_v<OutDataType, F32> || is_same_v<OutDataType, I8> ||
-                      is_same_v<OutDataType, I32> || is_same_v<OutDataType, int>,
+                      is_same_v<OutDataType, F32> || is_same_v<ComputeDataType, TF32> ||
+                      is_same_v<OutDataType, I8> || is_same_v<OutDataType, I32> ||
+                      is_same_v<OutDataType, int>,
                  "Warning: Unhandled OutDataType for setting up the absolute threshold!");
    double output_error = 0;
    if constexpr(is_same_v<OutDataType, I8> || is_same_v<OutDataType, I32> ||
@@ -132,8 +139,9 @@ double get_absolute_threshold(const double max_possible_num, const int number_of

    static_assert(is_same_v<AccDataType, F4> || is_same_v<AccDataType, F8> ||
                      is_same_v<AccDataType, F16> || is_same_v<AccDataType, BF16> ||
-                      is_same_v<AccDataType, F32> || is_same_v<AccDataType, I8> ||
-                      is_same_v<AccDataType, I32> || is_same_v<AccDataType, int>,
+                      is_same_v<AccDataType, F32> || is_same_v<ComputeDataType, TF32> ||
+                      is_same_v<AccDataType, I8> || is_same_v<AccDataType, I32> ||
+                      is_same_v<AccDataType, int>,
                  "Warning: Unhandled AccDataType for setting up the absolute threshold!");
    double acc_error = 0;
    if constexpr(is_same_v<AccDataType, I8> || is_same_v<AccDataType, I32> ||
@@ -149,11 +157,67 @@ double get_absolute_threshold(const double max_possible_num, const int number_of
    return std::max(acc_error, midway_error);
 }

-template <typename Range, typename RefRange>
+template <typename Range,
+          typename RefRange,
+          typename ComputeDataType = ranges::range_value_t<Range>>
+typename std::enable_if<
+    std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
+        std::is_same_v<ranges::range_value_t<Range>, float> &&
+        std::is_same_v<ComputeDataType, ck::tf32_t>,
+    bool>::type
+check_err(const Range& out,
+          const RefRange& ref,
+          const std::string& msg = "Error: Incorrect results!",
+          double rtol            = 1e-5,
+          double atol            = 3e-5)
+{
+    if(out.size() != ref.size())
+    {
+        std::cerr << msg << " out.size() != ref.size(), :" << out.size() << " != " << ref.size()
+                  << std::endl;
+        return false;
+    }
+
+    bool res{true};
+    int err_count  = 0;
+    double err     = 0;
+    double max_err = std::numeric_limits<double>::min();
+    for(std::size_t i = 0; i < ref.size(); ++i)
+    {
+        const double o = *std::next(std::begin(out), i);
+        const double r = *std::next(std::begin(ref), i);
+        err            = std::abs(o - r);
+        if(err > atol + rtol * std::abs(r) || !std::isfinite(o) || !std::isfinite(r))
+        {
+            max_err = err > max_err ? err : max_err;
+            if(err_count < 5)
+            {
+                std::cerr << msg << std::setw(12) << std::setprecision(7) << " out[" << i
+                          << "] != ref[" << i << "]: " << o << " != " << r << std::endl;
+            }
+            res = false;
+            err_count++;
+        }
+    }
+    if(!res)
+    {
+        const float error_percent =
+            static_cast<float>(err_count) / static_cast<float>(out.size()) * 100.f;
+        std::cerr << "max err: " << max_err;
+        std::cerr << ", number of errors: " << err_count;
+        std::cerr << ", " << error_percent << "% wrong values" << std::endl;
+    }
+    return res;
+}
+
+template <typename Range,
+          typename RefRange,
+          typename ComputeDataType = ranges::range_value_t<Range>>
 typename std::enable_if<
    std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
        std::is_floating_point_v<ranges::range_value_t<Range>> &&
-        !std::is_same_v<ranges::range_value_t<Range>, half_t>,
+        !std::is_same_v<ranges::range_value_t<Range>, half_t> &&
+        !std::is_same_v<ComputeDataType, ck::tf32_t>,
    bool>::type
 check_err(const Range& out,
          const RefRange& ref,
@@ -200,7 +264,9 @@ check_err(const Range& out,
    return res;
 }

-template <typename Range, typename RefRange>
+template <typename Range,
+          typename RefRange,
+          typename ComputeDataType = ranges::range_value_t<Range>>
 typename std::enable_if<
    std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
        std::is_same_v<ranges::range_value_t<Range>, bhalf_t>,
@@ -251,7 +317,9 @@ check_err(const Range& out,
    return res;
 }

-template <typename Range, typename RefRange>
+template <typename Range,
+          typename RefRange,
+          typename ComputeDataType = ranges::range_value_t<Range>>
 typename std::enable_if<
    std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
        std::is_same_v<ranges::range_value_t<Range>, half_t>,
@@ -301,7 +369,9 @@ check_err(const Range& out,
    return res;
 }

-template <typename Range, typename RefRange>
+template <typename Range,
+          typename RefRange,
+          typename ComputeDataType = ranges::range_value_t<Range>>
 std::enable_if_t<(std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
                  std::is_integral_v<ranges::range_value_t<Range>> &&
                  !std::is_same_v<ranges::range_value_t<Range>, bhalf_t> &&
@@ -358,7 +428,9 @@ check_err(const Range& out,
    return res;
 }

-template <typename Range, typename RefRange>
+template <typename Range,
+          typename RefRange,
+          typename ComputeDataType = ranges::range_value_t<Range>>
 std::enable_if_t<(std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
                  std::is_same_v<ranges::range_value_t<Range>, f8_t>),
                 bool>
@@ -407,7 +479,9 @@ check_err(const Range& out,
    return res;
 }

-template <typename Range, typename RefRange>
+template <typename Range,
+          typename RefRange,
+          typename ComputeDataType = ranges::range_value_t<Range>>
 std::enable_if_t<(std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
                  std::is_same_v<ranges::range_value_t<Range>, bf8_t>),
                 bool>
@@ -452,7 +526,9 @@ check_err(const Range& out,
    return res;
 }

-template <typename Range, typename RefRange>
+template <typename Range,
+          typename RefRange,
+          typename ComputeDataType = ranges::range_value_t<Range>>
 std::enable_if_t<(std::is_same_v<ranges::range_value_t<Range>, ranges::range_value_t<RefRange>> &&
                  std::is_same_v<ranges::range_value_t<Range>, f4_t>),
                 bool>
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp
@@ -1499,6 +1499,22 @@ struct DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1
        {
            return false;
        }
+        if constexpr(is_same_v<AComputeType, ck::tf32_t> || is_same_v<BComputeType, ck::tf32_t>)
+        {
+            if(!is_tf32_supported())
+            {
+                return false;
+            }
+            if constexpr(!is_same_v<AComputeType, BComputeType>)
+            {
+                if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+                {
+                    std::cout << "ComputeDataType for A and B should be same while using TF32"
+                              << std::endl;
+                }
+                return false;
+            }
+        }

        if constexpr(!IsSplitKSupported)
        {
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_multiple_d_xdl_cshuffle.hpp
@@ -951,6 +951,22 @@ struct DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle
        {
            return false;
        }
+        if constexpr(is_same_v<ComputeTypeA, ck::tf32_t> || is_same_v<ComputeTypeB, ck::tf32_t>)
+        {
+            if(!is_tf32_supported())
+            {
+                return false;
+            }
+            if constexpr(!is_same_v<ComputeTypeA, ComputeTypeB>)
+            {
+                if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+                {
+                    std::cout << "ComputeDataType for A and B should be same while using TF32"
+                              << std::endl;
+                }
+                return false;
+            }
+        }
        if constexpr(NDimSpatial == 1)
        {
            if constexpr(!is_GNWC_GKXC_GNWK<InLayout, WeiLayout, OutLayout>())
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp
@@ -1687,6 +1687,23 @@ struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle
        const index_t GemmK =
            arg.a_grid_desc_k0_m_k1_.GetLength(I0) * arg.a_grid_desc_k0_m_k1_.GetLength(I2);

+        if constexpr(is_same_v<ComputeTypeA, ck::tf32_t> || is_same_v<ComputeTypeB, ck::tf32_t>)
+        {
+            if(!is_tf32_supported())
+            {
+                return false;
+            }
+            if constexpr(!is_same_v<ComputeTypeA, ComputeTypeB>)
+            {
+                if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+                {
+                    std::cout << "ComputeDataType for A and B should be same while using TF32"
+                              << std::endl;
+                }
+                return false;
+            }
+        }
+
        if(get_warp_size() == 64)
        {
            if constexpr(NXdlPerWave64 > 0)
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp
@@ -950,6 +950,22 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
        {
            return false;
        }
+        if constexpr(is_same_v<ComputeTypeA, ck::tf32_t> || is_same_v<ComputeTypeB, ck::tf32_t>)
+        {
+            if(!is_tf32_supported())
+            {
+                return false;
+            }
+            if constexpr(!is_same_v<ComputeTypeA, ComputeTypeB>)
+            {
+                if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+                {
+                    std::cout << "ComputeDataType for A and B should be same while using TF32"
+                              << std::endl;
+                }
+                return false;
+            }
+        }
        if constexpr(NDimSpatial == 1)
        {
            if constexpr(!is_GNWC_GKXC_GNWK<InLayout, WeiLayout, OutLayout>())
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp
@@ -1289,6 +1289,23 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffleV3
        const index_t GemmK = arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I0) *
                              arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I2);

+        if constexpr(is_same_v<ComputeTypeA, ck::tf32_t> || is_same_v<ComputeTypeB, ck::tf32_t>)
+        {
+            if(!is_tf32_supported())
+            {
+                return false;
+            }
+            if constexpr(!is_same_v<ComputeTypeA, ComputeTypeB>)
+            {
+                if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+                {
+                    std::cout << "ComputeDataType for A and B should be same while using TF32"
+                              << std::endl;
+                }
+                return false;
+            }
+        }
+
        if(get_warp_size() == 64)
        {
            if constexpr(NXdlPerWave64 > 0)
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp
@@ -1399,6 +1399,25 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3
            }
            return false;
        }
+
+        if constexpr(is_same_v<AComputeDataType, ck::tf32_t> ||
+                     is_same_v<BComputeDataType, ck::tf32_t>)
+        {
+            if(!is_tf32_supported())
+            {
+                return false;
+            }
+            if constexpr(!is_same_v<AComputeDataType, BComputeDataType>)
+            {
+                if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+                {
+                    std::cout << "ComputeDataType for A and B should be same while using TF32"
+                              << std::endl;
+                }
+                return false;
+            }
+        }
+
        // check ConvolutionForwardSpecialization
        if constexpr(ConvForwardSpecialization ==
                     ConvolutionForwardSpecialization::Filter1x1Stride1Pad0)
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_large_tensor_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_large_tensor_cshuffle.hpp
@@ -820,6 +820,23 @@ struct DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor
        {
            return false;
        }
+        if constexpr(is_same_v<AComputeDataType, ck::tf32_t> ||
+                     is_same_v<BComputeDataType, ck::tf32_t>)
+        {
+            if(!is_tf32_supported())
+            {
+                return false;
+            }
+            if constexpr(!is_same_v<AComputeDataType, BComputeDataType>)
+            {
+                if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
+                {
+                    std::cout << "ComputeDataType for A and B should be same while using TF32"
+                              << std::endl;
+                }
+                return false;
+            }
+        }
        // check ConvolutionForwardSpecialization
        if constexpr(ConvForwardSpecialization ==
                     ConvolutionForwardSpecialization::Filter1x1Stride1Pad0)
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp
@@ -280,8 +280,8 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight
    using FloatBAdjusted =
        conditional_t<is_same_v<ComputeTypeB, ck::half_t>, ck::bhalf_t, ComputeTypeB>;
 #else
-    using FloatAAdjusted = ComputeTypeA;
-    using FloatBAdjusted = ComputeTypeB;
+    using FloatAAdjusted = conditional_t<is_same_v<ComputeTypeA, ck::tf32_t>, float, ComputeTypeA>;
+    using FloatBAdjusted = conditional_t<is_same_v<ComputeTypeB, ck::tf32_t>, float, ComputeTypeB>;
 #endif

    // M0/M1/M1Padding
@@ -760,19 +760,19 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight
        //       register
        // sanity check
        constexpr bool is_single_rate_mfma =
-            (((is_same<FloatAAdjusted, half_t>::value || is_same<FloatAAdjusted, bhalf_t>::value) &&
+            (((is_same<ComputeTypeA, half_t>::value || is_same<ComputeTypeA, bhalf_t>::value) &&
              K1 <= 4) ||
-             (is_same<FloatAAdjusted, int8_t>::value && K1 <= 8) ||
-             ((is_same<FloatAAdjusted, f8_t>::value || is_same<FloatAAdjusted, bf8_t>::value) &&
+             (is_same<ComputeTypeA, int8_t>::value && K1 <= 8) ||
+             ((is_same<ComputeTypeA, f8_t>::value || is_same<ComputeTypeA, bf8_t>::value) &&
              K1 < 32))
                ? true
                : false;
        constexpr auto is_scale_mfma = false;
        constexpr index_t KPack      = math::max(K1,
-                                            MfmaSelector<FloatAAdjusted,
+                                            MfmaSelector<ComputeTypeA,
                                                              MPerXdl,
                                                              NPerXdl,
-                                                              FloatBAdjusted,
+                                                              ComputeTypeB,
                                                              is_single_rate_mfma,
                                                              is_scale_mfma>::selected_mfma.k_per_blk);

@@ -787,7 +787,9 @@ struct GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight
                                                                NPerXdl,
                                                                MRepeat,
                                                                NRepeat,
-                                                                KPack>{};
+                                                                KPack,
+                                                                ComputeTypeA,
+                                                                ComputeTypeB>{};

        auto c_thread_buf = blockwise_gemm.GetCThreadBuffer();

--- a/include/ck/utility/numeric_utils.hpp
+++ b/include/ck/utility/numeric_utils.hpp
@@ -45,6 +45,24 @@ struct NumericUtils<float>
    using bitwise_type                  = uint32_t;
 };

+template <>
+struct NumericUtils<ck::tf32_t>
+{
+    static constexpr int exp            = 8;
+    static constexpr int mant           = 10;
+    static constexpr int bias           = 127;
+    static constexpr uint32_t nan_mask  = 0x7F800000;
+    static constexpr uint32_t head_mask = 0xFF800000;
+    static constexpr uint32_t mant_mask = 0x7FFFFF;
+    static constexpr uint32_t exp_mask  = 0xFF;
+    static constexpr uint32_t Inf       = 0x7F800000;
+    static constexpr uint32_t NegInf    = 0xFF800000;
+    static constexpr uint32_t NaN       = 0x7F800001;
+    static constexpr uint32_t Neg0      = 0x80000000;
+    static constexpr bool has_inf       = true;
+    using bitwise_type                  = uint32_t;
+};
+
 template <>
 struct NumericUtils<half_t>
 {