Replace grouped conv bwd wei wmmaV3 bilin/scale bf16f32bf16 support with bf16bf16bf16 (#3470)

* Replace grouped convolution bwd weight wmma v3 bilinear and scale bf16f32bf16 support with bf16bf16bf16 support. Update tests.
* Tentative fix for bwd weight bilinear bf16bf16bf16: the bilinear elementwise overload for this case (bf16, f32 accumulator, bf16) appears to have been wrong.

[ROCm/composable_kernel commit: 88ae445580]
2026-05-20 12:59:49 +00:00 · 2025-12-29 12:58:29 +01:00
parent 13134864cc
commit ac28f1b016
10 changed files with 47 additions and 46 deletions
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_multiple_d_wmma_cshuffle_v3.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_multiple_d_wmma_cshuffle_v3.hpp
@@ -746,7 +746,6 @@ struct DeviceGroupedConvBwdWeightMultipleD_Wmma_CShuffleV3
                                  arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I2);

            AccDataType* p_e_grid = type_convert<AccDataType*>(arg.p_workspace_);
-            ;

            // Convolution kernel dispatch
            typename GridwiseGemm::Argument gemm_arg{
--- a/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp
@@ -348,9 +348,7 @@ struct Bilinear
    __host__ __device__ constexpr void
    operator()<bhalf_t, float, bhalf_t>(bhalf_t& y, const float& x0, const bhalf_t& x1) const
    {
-        const float x1_tmp = ck::type_convert<float>(x1);
-        const float y_tmp  = alpha_ * x0 + beta_ * x1_tmp;
-        y                  = y_tmp;
+        y = type_convert<bhalf_t>(alpha_ * x0 + beta_ * ck::type_convert<float>(x1));
    };

    template <>