mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-17 19:40:04 +00:00
fixed multiple definition issue of bfp16/fp32 conversion function when building ckProfiler (#51)
* fixed bfloat16 issues
* refactor type_convert
Co-authored-by: Chao Liu <chao.liu2@amd.com>
[ROCm/composable_kernel commit: 0a66c54e95]
This commit is contained in:
@@ -1,162 +1,6 @@
|
||||
#pragma once
|
||||
#include "host_tensor.hpp"
|
||||
|
||||
template <>
|
||||
void host_gemm<ushort, ushort, ushort>(const Tensor<ushort>& a,
|
||||
const Tensor<ushort>& b,
|
||||
Tensor<ushort>& c,
|
||||
const GemmMatrixLayout layout)
|
||||
{
|
||||
if(layout == GemmMatrixLayout::MK_KN_MN)
|
||||
{
|
||||
auto f_mk_kn_mn = [&](auto m, auto n) {
|
||||
const int K = a.mDesc.GetLengths()[1];
|
||||
|
||||
double v = 0;
|
||||
|
||||
for(int k = 0; k < K; ++k)
|
||||
{
|
||||
v += ck::bf16_to_f32(a(m, k)) * ck::bf16_to_f32(b(k, n));
|
||||
}
|
||||
|
||||
c(m, n) = ck::f32_to_bf16(v);
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(f_mk_kn_mn, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
|
||||
std::thread::hardware_concurrency());
|
||||
}
|
||||
else if(layout == GemmMatrixLayout::MK_NK_MN)
|
||||
{
|
||||
auto f_mk_nk_mn = [&](auto m, auto n) {
|
||||
const int K = a.mDesc.GetLengths()[1];
|
||||
|
||||
double v = 0;
|
||||
|
||||
for(int k = 0; k < K; ++k)
|
||||
{
|
||||
v += ck::bf16_to_f32(a(m, k)) * ck::bf16_to_f32(b(n, k));
|
||||
}
|
||||
|
||||
c(m, n) = ck::f32_to_bf16(v);
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(f_mk_nk_mn, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
|
||||
std::thread::hardware_concurrency());
|
||||
}
|
||||
else if(layout == GemmMatrixLayout::KM_KN_MN)
|
||||
{
|
||||
auto f_km_kn_mn = [&](auto m, auto n) {
|
||||
const int K = a.mDesc.GetLengths()[0];
|
||||
|
||||
double v = 0;
|
||||
|
||||
for(int k = 0; k < K; ++k)
|
||||
{
|
||||
v += ck::bf16_to_f32(a(k, m)) * ck::bf16_to_f32(b(k, n));
|
||||
}
|
||||
|
||||
c(m, n) = ck::f32_to_bf16(v);
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(f_km_kn_mn, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
|
||||
std::thread::hardware_concurrency());
|
||||
}
|
||||
else if(layout == GemmMatrixLayout::KM_NK_MN)
|
||||
{
|
||||
auto f_km_nk_mn = [&](auto m, auto n) {
|
||||
const int K = a.mDesc.GetLengths()[0];
|
||||
|
||||
double v = 0;
|
||||
|
||||
for(int k = 0; k < K; ++k)
|
||||
{
|
||||
v += ck::bf16_to_f32(a(k, m)) * ck::bf16_to_f32(b(n, k));
|
||||
}
|
||||
|
||||
c(m, n) = ck::f32_to_bf16(v);
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(f_km_nk_mn, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
|
||||
std::thread::hardware_concurrency());
|
||||
}
|
||||
else if(layout == GemmMatrixLayout::MK_KN_NM)
|
||||
{
|
||||
auto f_mk_kn_nm = [&](auto n, auto m) {
|
||||
const int K = a.mDesc.GetLengths()[1];
|
||||
|
||||
double v = 0;
|
||||
|
||||
for(int k = 0; k < K; ++k)
|
||||
{
|
||||
v += ck::bf16_to_f32(a(m, k)) * ck::bf16_to_f32(b(k, n));
|
||||
}
|
||||
|
||||
c(n, m) = ck::f32_to_bf16(v);
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(f_mk_kn_nm, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
|
||||
std::thread::hardware_concurrency());
|
||||
}
|
||||
else if(layout == GemmMatrixLayout::MK_NK_NM)
|
||||
{
|
||||
auto f_mk_nk_nm = [&](auto n, auto m) {
|
||||
const int K = a.mDesc.GetLengths()[1];
|
||||
|
||||
double v = 0;
|
||||
|
||||
for(int k = 0; k < K; ++k)
|
||||
{
|
||||
v += ck::bf16_to_f32(a(m, k)) * ck::bf16_to_f32(b(n, k));
|
||||
}
|
||||
|
||||
c(n, m) = ck::f32_to_bf16(v);
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(f_mk_nk_nm, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
|
||||
std::thread::hardware_concurrency());
|
||||
}
|
||||
else if(layout == GemmMatrixLayout::KM_KN_NM)
|
||||
{
|
||||
auto f_km_kn_nm = [&](auto n, auto m) {
|
||||
const int K = a.mDesc.GetLengths()[0];
|
||||
|
||||
double v = 0;
|
||||
|
||||
for(int k = 0; k < K; ++k)
|
||||
{
|
||||
v += ck::bf16_to_f32(a(k, m)) * ck::bf16_to_f32(b(k, n));
|
||||
}
|
||||
|
||||
c(n, m) = ck::f32_to_bf16(v);
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(f_km_kn_nm, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
|
||||
std::thread::hardware_concurrency());
|
||||
}
|
||||
else if(layout == GemmMatrixLayout::KM_NK_NM)
|
||||
{
|
||||
auto f_km_nk_nm = [&](auto n, auto m) {
|
||||
const int K = a.mDesc.GetLengths()[0];
|
||||
|
||||
double v = 0;
|
||||
|
||||
for(int k = 0; k < K; ++k)
|
||||
{
|
||||
v += ck::bf16_to_f32(a(k, m)) * ck::bf16_to_f32(b(n, k));
|
||||
}
|
||||
|
||||
c(n, m) = ck::f32_to_bf16(v);
|
||||
};
|
||||
|
||||
make_ParallelTensorFunctor(f_km_nk_nm, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
|
||||
std::thread::hardware_concurrency());
|
||||
}
|
||||
else
|
||||
{
|
||||
throw std::runtime_error("wrong! not supported layout");
|
||||
}
|
||||
}
|
||||
|
||||
template <typename AType, typename BType, typename CType>
|
||||
void host_gemm_mk_kn_mn(const Tensor<AType>& a_m_k,
|
||||
const Tensor<BType>& b_k_n,
|
||||
|
||||
@@ -299,53 +299,41 @@ HostTensorDescriptor::HostTensorDescriptor(std::vector<X> lens, std::vector<Y> s
|
||||
|
||||
void ostream_HostTensorDescriptor(const HostTensorDescriptor& desc, std::ostream& os = std::cout);
|
||||
|
||||
float bf16_to_f32_(ushort src_val);
|
||||
|
||||
template <typename T>
|
||||
void check_error(const Tensor<T>& ref, const Tensor<T>& result)
|
||||
{
|
||||
float error = 0;
|
||||
float max_diff = -1;
|
||||
float ref_value = 0, result_value = 0;
|
||||
for(int i = 0; i < ref.mData.size(); ++i)
|
||||
|
||||
if constexpr(std::is_same<ushort, T>::value)
|
||||
{
|
||||
error += std::abs(double(ref.mData[i]) - double(result.mData[i]));
|
||||
float diff = std::abs(double(ref.mData[i]) - double(result.mData[i]));
|
||||
if(max_diff < diff)
|
||||
for(int i = 0; i < ref.mData.size(); ++i)
|
||||
{
|
||||
max_diff = diff;
|
||||
ref_value = ref.mData[i];
|
||||
result_value = result.mData[i];
|
||||
error += std::abs(bf16_to_f32_(ref.mData[i]) - bf16_to_f32_(result.mData[i]));
|
||||
float diff = std::abs(bf16_to_f32_(ref.mData[i]) - bf16_to_f32_(result.mData[i]));
|
||||
if(max_diff < diff)
|
||||
{
|
||||
max_diff = diff;
|
||||
ref_value = bf16_to_f32_(ref.mData[i]);
|
||||
result_value = bf16_to_f32_(result.mData[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << "error: " << error << std::endl;
|
||||
std::cout << "max_diff: " << max_diff << ", " << ref_value << ", " << result_value << std::endl;
|
||||
}
|
||||
|
||||
__host__ __device__ float bf16_to_f32(ushort src_val)
|
||||
{
|
||||
union
|
||||
else
|
||||
{
|
||||
uint32_t int32;
|
||||
float fp32;
|
||||
} u = {uint32_t(src_val) << 16};
|
||||
return u.fp32;
|
||||
}
|
||||
|
||||
template <>
|
||||
void check_error<ushort>(const Tensor<ushort>& ref, const Tensor<ushort>& result)
|
||||
{
|
||||
float error = 0;
|
||||
float max_diff = -1;
|
||||
float ref_value = 0, result_value = 0;
|
||||
for(int i = 0; i < ref.mData.size(); ++i)
|
||||
{
|
||||
error += std::abs(bf16_to_f32(ref.mData[i]) - bf16_to_f32(result.mData[i]));
|
||||
float diff = std::abs(bf16_to_f32(ref.mData[i]) - bf16_to_f32(result.mData[i]));
|
||||
if(max_diff < diff)
|
||||
for(int i = 0; i < ref.mData.size(); ++i)
|
||||
{
|
||||
max_diff = diff;
|
||||
ref_value = bf16_to_f32(ref.mData[i]);
|
||||
result_value = bf16_to_f32(result.mData[i]);
|
||||
error += std::abs(double(ref.mData[i]) - double(result.mData[i]));
|
||||
float diff = std::abs(double(ref.mData[i]) - double(result.mData[i]));
|
||||
if(max_diff < diff)
|
||||
{
|
||||
max_diff = diff;
|
||||
ref_value = ref.mData[i];
|
||||
result_value = result.mData[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -5,15 +5,25 @@
|
||||
#include "config.hpp"
|
||||
#include "data_type.hpp"
|
||||
|
||||
// Tensor-fill functor: yields a value-initialized T (zero for arithmetic
// types) at every coordinate. The index arguments are accepted for
// interface compatibility with the other GeneratorTensor_* functors and
// are ignored.
template <typename T>
struct GeneratorTensor_0
{
    template <typename... Is>
    T operator()(Is...)
    {
        return T{0};
    }
};
|
||||
|
||||
template <typename T>
|
||||
struct GeneratorTensor_1
|
||||
{
|
||||
int value = 1;
|
||||
|
||||
template <typename... Is>
|
||||
float operator()(Is...)
|
||||
T operator()(Is...)
|
||||
{
|
||||
return value;
|
||||
return ck::type_convert<T>(value);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -25,7 +35,7 @@ struct GeneratorTensor_1<ushort>
|
||||
template <typename... Is>
|
||||
ushort operator()(Is...)
|
||||
{
|
||||
return ck::f32_to_bf16(value);
|
||||
return ck::type_convert<ushort>(value);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -41,17 +51,6 @@ struct GeneratorTensor_1<int8_t>
|
||||
}
|
||||
};
|
||||
|
||||
struct GeneratorTensor_0
|
||||
{
|
||||
int value = 0;
|
||||
|
||||
template <typename... Is>
|
||||
float operator()(Is...)
|
||||
{
|
||||
return value;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T>
|
||||
struct GeneratorTensor_2
|
||||
{
|
||||
@@ -59,7 +58,7 @@ struct GeneratorTensor_2
|
||||
int max_value = 1;
|
||||
|
||||
template <typename... Is>
|
||||
float operator()(Is...)
|
||||
T operator()(Is...)
|
||||
{
|
||||
return (std::rand() % (max_value - min_value)) + min_value;
|
||||
}
|
||||
@@ -75,7 +74,7 @@ struct GeneratorTensor_2<ushort>
|
||||
ushort operator()(Is...)
|
||||
{
|
||||
float tmp = (std::rand() % (max_value - min_value)) + min_value;
|
||||
return ck::f32_to_bf16(tmp);
|
||||
return ck::type_convert<ushort>(tmp);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -99,7 +98,7 @@ struct GeneratorTensor_3
|
||||
T max_value = 1;
|
||||
|
||||
template <typename... Is>
|
||||
float operator()(Is...)
|
||||
T operator()(Is...)
|
||||
{
|
||||
float tmp = float(std::rand()) / float(RAND_MAX);
|
||||
|
||||
@@ -120,7 +119,7 @@ struct GeneratorTensor_3<ushort>
|
||||
|
||||
float fp32_tmp = min_value + tmp * (max_value - min_value);
|
||||
|
||||
return ck::f32_to_bf16(fp32_tmp);
|
||||
return ck::type_convert<ushort>(fp32_tmp);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
Reference in New Issue
Block a user