Fixed a multiple-definition issue with the bfp16/fp32 conversion functions when building ckProfiler (#51)

* fixed bfloat16 issues

* refactor type_convert

Co-authored-by: Chao Liu <chao.liu2@amd.com>

[ROCm/composable_kernel commit: 0a66c54e95]
Authored by: zjing14
Date: 2021-11-16 15:44:17 -06:00
Committed by: GitHub
Parent: ea6fa92eea
Commit: 1e7102575b
18 changed files with 157 additions and 337 deletions
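The build failure behind this commit is the classic one-definition-rule problem: conversion helpers such as bf16_to_f32/f32_to_bf16 were defined as ordinary (non-inline) functions in a header, so every translation unit of ckProfiler that included that header emitted its own copy and the final link failed with "multiple definition" errors. Below is a minimal sketch of the failure mode and of the declare-in-header / define-in-one-source fix that the diff applies to bf16_to_f32_; the file names are illustrative, not the repository's actual layout, and std::uint16_t stands in for the ushort used in the diff.

// conversions.hpp -- hypothetical shared header
#pragma once
#include <cstdint>

// BAD: a non-inline definition placed here is compiled into every .cpp that
// includes this header, and linking two such objects fails with
// "multiple definition of `bf16_to_f32_'".
//
// GOOD: only declare the function in the header ...
float bf16_to_f32_(std::uint16_t src_val);

// conversions.cpp -- ... and define it in exactly one translation unit.
// (Marking the function `inline`, or funnelling conversions through a
// template such as ck::type_convert, are the other standard cures.)
#include "conversions.hpp"

float bf16_to_f32_(std::uint16_t src_val)
{
    // Widen the bf16 bit pattern into the high half of an IEEE-754 binary32,
    // the same trick as the helper removed from the header further down.
    union
    {
        std::uint32_t int32;
        float fp32;
    } u = {std::uint32_t(src_val) << 16};

    return u.fp32;
}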


@@ -1,162 +1,6 @@
 #pragma once
 #include "host_tensor.hpp"
 
-template <>
-void host_gemm<ushort, ushort, ushort>(const Tensor<ushort>& a,
-                                       const Tensor<ushort>& b,
-                                       Tensor<ushort>& c,
-                                       const GemmMatrixLayout layout)
-{
-    if(layout == GemmMatrixLayout::MK_KN_MN)
-    {
-        auto f_mk_kn_mn = [&](auto m, auto n) {
-            const int K = a.mDesc.GetLengths()[1];
-            double v = 0;
-            for(int k = 0; k < K; ++k)
-            {
-                v += ck::bf16_to_f32(a(m, k)) * ck::bf16_to_f32(b(k, n));
-            }
-            c(m, n) = ck::f32_to_bf16(v);
-        };
-        make_ParallelTensorFunctor(f_mk_kn_mn, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
-            std::thread::hardware_concurrency());
-    }
-    else if(layout == GemmMatrixLayout::MK_NK_MN)
-    {
-        auto f_mk_nk_mn = [&](auto m, auto n) {
-            const int K = a.mDesc.GetLengths()[1];
-            double v = 0;
-            for(int k = 0; k < K; ++k)
-            {
-                v += ck::bf16_to_f32(a(m, k)) * ck::bf16_to_f32(b(n, k));
-            }
-            c(m, n) = ck::f32_to_bf16(v);
-        };
-        make_ParallelTensorFunctor(f_mk_nk_mn, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
-            std::thread::hardware_concurrency());
-    }
-    else if(layout == GemmMatrixLayout::KM_KN_MN)
-    {
-        auto f_km_kn_mn = [&](auto m, auto n) {
-            const int K = a.mDesc.GetLengths()[0];
-            double v = 0;
-            for(int k = 0; k < K; ++k)
-            {
-                v += ck::bf16_to_f32(a(k, m)) * ck::bf16_to_f32(b(k, n));
-            }
-            c(m, n) = ck::f32_to_bf16(v);
-        };
-        make_ParallelTensorFunctor(f_km_kn_mn, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
-            std::thread::hardware_concurrency());
-    }
-    else if(layout == GemmMatrixLayout::KM_NK_MN)
-    {
-        auto f_km_nk_mn = [&](auto m, auto n) {
-            const int K = a.mDesc.GetLengths()[0];
-            double v = 0;
-            for(int k = 0; k < K; ++k)
-            {
-                v += ck::bf16_to_f32(a(k, m)) * ck::bf16_to_f32(b(n, k));
-            }
-            c(m, n) = ck::f32_to_bf16(v);
-        };
-        make_ParallelTensorFunctor(f_km_nk_mn, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
-            std::thread::hardware_concurrency());
-    }
-    else if(layout == GemmMatrixLayout::MK_KN_NM)
-    {
-        auto f_mk_kn_nm = [&](auto n, auto m) {
-            const int K = a.mDesc.GetLengths()[1];
-            double v = 0;
-            for(int k = 0; k < K; ++k)
-            {
-                v += ck::bf16_to_f32(a(m, k)) * ck::bf16_to_f32(b(k, n));
-            }
-            c(n, m) = ck::f32_to_bf16(v);
-        };
-        make_ParallelTensorFunctor(f_mk_kn_nm, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
-            std::thread::hardware_concurrency());
-    }
-    else if(layout == GemmMatrixLayout::MK_NK_NM)
-    {
-        auto f_mk_nk_nm = [&](auto n, auto m) {
-            const int K = a.mDesc.GetLengths()[1];
-            double v = 0;
-            for(int k = 0; k < K; ++k)
-            {
-                v += ck::bf16_to_f32(a(m, k)) * ck::bf16_to_f32(b(n, k));
-            }
-            c(n, m) = ck::f32_to_bf16(v);
-        };
-        make_ParallelTensorFunctor(f_mk_nk_nm, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
-            std::thread::hardware_concurrency());
-    }
-    else if(layout == GemmMatrixLayout::KM_KN_NM)
-    {
-        auto f_km_kn_nm = [&](auto n, auto m) {
-            const int K = a.mDesc.GetLengths()[0];
-            double v = 0;
-            for(int k = 0; k < K; ++k)
-            {
-                v += ck::bf16_to_f32(a(k, m)) * ck::bf16_to_f32(b(k, n));
-            }
-            c(n, m) = ck::f32_to_bf16(v);
-        };
-        make_ParallelTensorFunctor(f_km_kn_nm, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
-            std::thread::hardware_concurrency());
-    }
-    else if(layout == GemmMatrixLayout::KM_NK_NM)
-    {
-        auto f_km_nk_nm = [&](auto n, auto m) {
-            const int K = a.mDesc.GetLengths()[0];
-            double v = 0;
-            for(int k = 0; k < K; ++k)
-            {
-                v += ck::bf16_to_f32(a(k, m)) * ck::bf16_to_f32(b(n, k));
-            }
-            c(n, m) = ck::f32_to_bf16(v);
-        };
-        make_ParallelTensorFunctor(f_km_nk_nm, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
-            std::thread::hardware_concurrency());
-    }
-    else
-    {
-        throw std::runtime_error("wrong! not supported layout");
-    }
-}
-
 template <typename AType, typename BType, typename CType>
 void host_gemm_mk_kn_mn(const Tensor<AType>& a_m_k,
                         const Tensor<BType>& b_k_n,
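The specialization deleted above existed only because the bf16 path needed explicit bit conversions around every load and store. Once conversion is funnelled through a single templated helper, one reference GEMM template can cover every element type. Below is a minimal, self-contained sketch of that idea; convert, bf16_t, and the flat std::vector layout are stand-ins for illustration, not the repository's host_gemm_mk_kn_mn or ck::type_convert.

#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

using bf16_t = std::uint16_t; // bf16 carried as its raw 16-bit pattern

// Stand-in for a type_convert-style helper: static_cast by default,
// bit manipulation for the bf16 <-> float pair.
template <typename Y, typename X>
Y convert(X x)
{
    return static_cast<Y>(x);
}

template <>
float convert<float, bf16_t>(bf16_t x)
{
    std::uint32_t bits = std::uint32_t(x) << 16;
    float y;
    std::memcpy(&y, &bits, sizeof(y));
    return y;
}

template <>
bf16_t convert<bf16_t, float>(float x)
{
    std::uint32_t bits;
    std::memcpy(&bits, &x, sizeof(bits));
    return bf16_t(bits >> 16); // plain truncation, like the removed f32_to_bf16
}

// One templated reference GEMM (row-major A[MxK] * B[KxN] = C[MxN]):
// widen each element to float, accumulate in double, narrow the result back.
template <typename AType, typename BType, typename CType>
void host_gemm_mk_kn_mn(const std::vector<AType>& a,
                        const std::vector<BType>& b,
                        std::vector<CType>& c,
                        int M, int N, int K)
{
    for(int m = 0; m < M; ++m)
        for(int n = 0; n < N; ++n)
        {
            double v = 0;
            for(int k = 0; k < K; ++k)
                v += convert<float>(a[m * K + k]) * convert<float>(b[k * N + n]);
            c[m * N + n] = convert<CType>(static_cast<float>(v));
        }
}

int main()
{
    const int M = 2, N = 2, K = 2;
    std::vector<bf16_t> a{convert<bf16_t>(1.0f), convert<bf16_t>(2.0f),
                          convert<bf16_t>(3.0f), convert<bf16_t>(4.0f)};
    std::vector<bf16_t> b = a;
    std::vector<bf16_t> c(M * N);

    host_gemm_mk_kn_mn(a, b, c, M, N, K);

    std::cout << convert<float>(c[0]) << '\n'; // 1*1 + 2*3 = 7
}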


@@ -299,53 +299,41 @@ HostTensorDescriptor::HostTensorDescriptor(std::vector<X> lens, std::vector<Y> s
 void ostream_HostTensorDescriptor(const HostTensorDescriptor& desc, std::ostream& os = std::cout);
 
+float bf16_to_f32_(ushort src_val);
+
 template <typename T>
 void check_error(const Tensor<T>& ref, const Tensor<T>& result)
 {
     float error = 0;
     float max_diff = -1;
     float ref_value = 0, result_value = 0;
-    for(int i = 0; i < ref.mData.size(); ++i)
+    if constexpr(std::is_same<ushort, T>::value)
     {
-        error += std::abs(double(ref.mData[i]) - double(result.mData[i]));
-        float diff = std::abs(double(ref.mData[i]) - double(result.mData[i]));
-        if(max_diff < diff)
+        for(int i = 0; i < ref.mData.size(); ++i)
         {
-            max_diff = diff;
-            ref_value = ref.mData[i];
-            result_value = result.mData[i];
+            error += std::abs(bf16_to_f32_(ref.mData[i]) - bf16_to_f32_(result.mData[i]));
+            float diff = std::abs(bf16_to_f32_(ref.mData[i]) - bf16_to_f32_(result.mData[i]));
+            if(max_diff < diff)
+            {
+                max_diff = diff;
+                ref_value = bf16_to_f32_(ref.mData[i]);
+                result_value = bf16_to_f32_(result.mData[i]);
+            }
         }
     }
-
-    std::cout << "error: " << error << std::endl;
-    std::cout << "max_diff: " << max_diff << ", " << ref_value << ", " << result_value << std::endl;
-}
-
-__host__ __device__ float bf16_to_f32(ushort src_val)
-{
-    union
+    else
     {
-        uint32_t int32;
-        float fp32;
-    } u = {uint32_t(src_val) << 16};
-
-    return u.fp32;
-}
-
-template <>
-void check_error<ushort>(const Tensor<ushort>& ref, const Tensor<ushort>& result)
-{
-    float error = 0;
-    float max_diff = -1;
-    float ref_value = 0, result_value = 0;
-    for(int i = 0; i < ref.mData.size(); ++i)
-    {
-        error += std::abs(bf16_to_f32(ref.mData[i]) - bf16_to_f32(result.mData[i]));
-        float diff = std::abs(bf16_to_f32(ref.mData[i]) - bf16_to_f32(result.mData[i]));
-        if(max_diff < diff)
+        for(int i = 0; i < ref.mData.size(); ++i)
         {
-            max_diff = diff;
-            ref_value = bf16_to_f32(ref.mData[i]);
-            result_value = bf16_to_f32(result.mData[i]);
+            error += std::abs(double(ref.mData[i]) - double(result.mData[i]));
+            float diff = std::abs(double(ref.mData[i]) - double(result.mData[i]));
+            if(max_diff < diff)
+            {
+                max_diff = diff;
+                ref_value = ref.mData[i];
+                result_value = result.mData[i];
+            }
         }
     }
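The refactor above folds the old check_error<ushort> specialization into the primary template: an if constexpr on the element type keeps the bf16 decode in one place, and the branch that does not match T is discarded rather than instantiated, so the float/int paths never touch bf16_to_f32_. A small self-contained illustration of that dispatch pattern (abs_diff and bf16_bits_to_f32 are illustrative names, not repository functions):

#include <cmath>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <type_traits>

// Decode a raw bf16 bit pattern to float (same idea as bf16_to_f32_).
float bf16_bits_to_f32(std::uint16_t bits)
{
    std::uint32_t u = std::uint32_t(bits) << 16;
    float f;
    std::memcpy(&f, &u, sizeof(f));
    return f;
}

// One template handles both cases: the if constexpr branch that does not
// match T is discarded and never instantiated.
template <typename T>
double abs_diff(T ref, T res)
{
    if constexpr(std::is_same<std::uint16_t, T>::value)
        return std::abs(bf16_bits_to_f32(ref) - bf16_bits_to_f32(res));
    else
        return std::abs(double(ref) - double(res));
}

int main()
{
    std::cout << abs_diff(1.0f, 1.5f) << '\n';                    // 0.5
    std::cout << abs_diff<std::uint16_t>(0x3F80, 0x4000) << '\n'; // |1.0 - 2.0| = 1
}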


@@ -5,15 +5,25 @@
 #include "config.hpp"
 #include "data_type.hpp"
 
+template <typename T>
+struct GeneratorTensor_0
+{
+    template <typename... Is>
+    T operator()(Is...)
+    {
+        return T{0};
+    }
+};
+
 template <typename T>
 struct GeneratorTensor_1
 {
     int value = 1;
 
     template <typename... Is>
-    float operator()(Is...)
+    T operator()(Is...)
     {
-        return value;
+        return ck::type_convert<T>(value);
     }
 };
@@ -25,7 +35,7 @@ struct GeneratorTensor_1<ushort>
 
     template <typename... Is>
     ushort operator()(Is...)
     {
-        return ck::f32_to_bf16(value);
+        return ck::type_convert<ushort>(value);
     }
 };
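The new call sites rely on a helper with the shape ck::type_convert<Destination>(source): the destination type is spelled explicitly and the source type is deduced. Below is a minimal sketch of a helper with that shape, given purely for illustration; the real ck::type_convert ships with the library (presumably alongside data_type.hpp, included above) and adds bf16-aware specializations.

#include <cstdint>

namespace demo {

// Destination type Y is named at the call site, source type X is deduced,
// so demo::type_convert<std::uint16_t>(1) and demo::type_convert<float>(x)
// read exactly like the ck::type_convert calls in the diff.
template <typename Y, typename X>
constexpr Y type_convert(X x)
{
    return static_cast<Y>(x);
}

} // namespace demo

static_assert(demo::type_convert<int>(2.5f) == 2, "truncating float -> int");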
@@ -41,17 +51,6 @@ struct GeneratorTensor_1<int8_t>
     }
 };
 
-struct GeneratorTensor_0
-{
-    int value = 0;
-
-    template <typename... Is>
-    float operator()(Is...)
-    {
-        return value;
-    }
-};
-
 template <typename T>
 struct GeneratorTensor_2
 {
@@ -59,7 +58,7 @@ struct GeneratorTensor_2
     int max_value = 1;
 
     template <typename... Is>
-    float operator()(Is...)
+    T operator()(Is...)
    {
         return (std::rand() % (max_value - min_value)) + min_value;
     }
@@ -75,7 +74,7 @@ struct GeneratorTensor_2<ushort>
     ushort operator()(Is...)
     {
         float tmp = (std::rand() % (max_value - min_value)) + min_value;
-        return ck::f32_to_bf16(tmp);
+        return ck::type_convert<ushort>(tmp);
     }
 };
@@ -99,7 +98,7 @@ struct GeneratorTensor_3
     T max_value = 1;
 
     template <typename... Is>
-    float operator()(Is...)
+    T operator()(Is...)
     {
         float tmp = float(std::rand()) / float(RAND_MAX);
@@ -120,7 +119,7 @@ struct GeneratorTensor_3<ushort>
         float fp32_tmp = min_value + tmp * (max_value - min_value);
 
-        return ck::f32_to_bf16(fp32_tmp);
+        return ck::type_convert<ushort>(fp32_tmp);
     }
 };
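With every generator now returning T (via ck::type_convert) instead of always returning float, the same functor template can feed generic fill code for float, int8_t, and bf16 buffers alike. A short sketch of that use follows; fill_tensor and RandomIntGenerator are illustrative stand-ins, not the repository's Tensor API or GeneratorTensor_2.

#include <cstdint>
#include <cstdlib>
#include <vector>

// Simplified generator in the spirit of GeneratorTensor_2: random ints in
// [min_value, max_value), converted to the element type T on return
// (static_cast here stands in for ck::type_convert).
template <typename T>
struct RandomIntGenerator
{
    int min_value = 0;
    int max_value = 5;

    template <typename... Is>
    T operator()(Is...) const
    {
        return static_cast<T>((std::rand() % (max_value - min_value)) + min_value);
    }
};

// Generic fill helper: works for any element type, because the generator
// already hands back values of that type.
template <typename T, typename Generator>
void fill_tensor(std::vector<T>& data, Generator g)
{
    for(std::size_t i = 0; i < data.size(); ++i)
        data[i] = g(i);
}

int main()
{
    std::vector<float> a(8);
    std::vector<std::int8_t> b(8);

    fill_tensor(a, RandomIntGenerator<float>{-5, 5});
    fill_tensor(b, RandomIntGenerator<std::int8_t>{-5, 5});
}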