Stream-K Reduction option as Runtime parameter and Compilation Error Fix (SK- Reduction) (#2145)

* reduction is passed as runtime parameter * clang * Update include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_streamk_v3.hpp Co-authored-by: John Afaganis <john.afaganis@amd.com> * Update include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp * remove comment ---------
2026-04-20 14:59:17 +00:00 · 2025-06-11 10:59:44 -07:00
parent 06e0b8436c
commit 6fad1c4874
7 changed files with 216 additions and 101 deletions
--- a/include/ck/utility/dynamic_buffer.hpp
+++ b/include/ck/utility/dynamic_buffer.hpp
@@ -139,7 +139,8 @@ struct DynamicBuffer
    template <InMemoryDataOperationEnum Op,
              typename X,
              typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
-                                         typename scalar_type<remove_cvref_t<T>>::type>::value,
+                                         typename scalar_type<remove_cvref_t<T>>::type>::value ||
+                                     !is_native_type<X>(),
                                 bool>::type = false>
    __host__ __device__ void Update(IndexType i, bool is_valid_element, const X& x)
    {
@@ -159,7 +160,37 @@ struct DynamicBuffer
        {
            auto tmp       = this->template Get<X>(i, is_valid_element);
            using scalar_t = typename scalar_type<remove_cvref_t<T>>::type;
-            // handle bfloat addition
+
+#if defined(__gfx942__) || defined(__gfx950__)
+
+            // Properly handle addition for all low-precision types
+            if constexpr(is_same_v<scalar_t, bhalf_t> || is_same_v<scalar_t, half_t>)
+            {
+                if constexpr(is_scalar_type<X>::value)
+                {
+                    // Scalar type: Convert to float, add, convert back
+                    auto result =
+                        type_convert<X>(type_convert<float>(x) + type_convert<float>(tmp));
+                    this->template Set<X>(i, is_valid_element, result);
+                }
+                else
+                {
+                    // Vector type
+                    constexpr auto vector_size = scalar_type<remove_cvref_t<X>>::vector_size;
+                    const vector_type<scalar_t, vector_size> a_vector{tmp};
+                    const vector_type<scalar_t, vector_size> b_vector{x};
+
+                    // Process each element of the vector in higher precision
+                    static_for<0, vector_size, 1>{}([&](auto idx) {
+                        auto result = type_convert<scalar_t>(
+                            type_convert<float>(a_vector.template AsType<scalar_t>()[idx]) +
+                            type_convert<float>(b_vector.template AsType<scalar_t>()[idx]));
+                        this->template Set<scalar_t>(i + idx, is_valid_element, result);
+                    });
+                }
+            }
+#else
+            //   handle bfloat addition
            if constexpr(is_same_v<scalar_t, bhalf_t>)
            {
                if constexpr(is_scalar_type<X>::value)
@@ -187,6 +218,8 @@ struct DynamicBuffer
            {
                this->template Set<X>(i, is_valid_element, x + tmp);
            }
+
+#endif
        }
    }

@@ -240,9 +273,20 @@ struct DynamicBuffer
        if constexpr(GetAddressSpace() == AddressSpaceEnum::Global && use_amd_buffer_addressing)
        {
            constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
+            using vector_t = typename vector_type_maker<remove_cvref_t<T>, t_per_x>::type::type;
+            vector_t tmp;
+
+            if constexpr(is_same_v<remove_cvref_t<X>, vector_t>)
+            {
+                tmp = x;
+            }
+            else
+            {
+                __builtin_memcpy(&tmp, &x, sizeof(vector_t));
+            }

            amd_buffer_store<remove_cvref_t<T>, t_per_x, coherence>(
-                x, p_data_, i, is_valid_element, element_space_size_ / PackedSize);
+                tmp, p_data_, i, is_valid_element, element_space_size_ / PackedSize);
        }
        else if constexpr(GetAddressSpace() == AddressSpaceEnum::Lds &&
                          is_same<typename scalar_type<remove_cvref_t<T>>::type, int8_t>::value &&