Fixed a multiple-definition issue with the bfp16/fp32 conversion functions when building ckProfiler (#51)

* fixed bfloat16 issues

* refactor type_convert

Co-authored-by: Chao Liu <chao.liu2@amd.com>

[ROCm/composable_kernel commit: 0a66c54e95]
Authored by: zjing14
Date: 2021-11-16 15:44:17 -06:00
Committed by: GitHub
Parent: ea6fa92eea
Commit: 1e7102575b
18 changed files with 157 additions and 337 deletions
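The build failure behind this commit is the classic one-definition-rule problem: conversion helpers such as bf16_to_f32/f32_to_bf16 were defined as ordinary (non-inline) functions in a header, so every translation unit of ckProfiler that included that header emitted its own copy and the final link failed with "multiple definition" errors. Below is a minimal sketch of the failure mode and of the declare-in-header / define-in-one-source fix that the diff applies to bf16_to_f32_; the file names are illustrative, not the repository's actual layout, and std::uint16_t stands in for the ushort used in the diff.

// conversions.hpp -- hypothetical shared header
#pragma once
#include <cstdint>

// BAD: a non-inline definition placed here is compiled into every .cpp that
// includes this header, and linking two such objects fails with
// "multiple definition of `bf16_to_f32_'".
//
// GOOD: only declare the function in the header ...
float bf16_to_f32_(std::uint16_t src_val);

// conversions.cpp -- ... and define it in exactly one translation unit.
// (Marking the function `inline`, or funnelling conversions through a
// template such as ck::type_convert, are the other standard cures.)
#include "conversions.hpp"

float bf16_to_f32_(std::uint16_t src_val)
{
    // Widen the bf16 bit pattern into the high half of an IEEE-754 binary32,
    // the same trick as the helper removed from the header further down.
    union
    {
        std::uint32_t int32;
        float fp32;
    } u = {std::uint32_t(src_val) << 16};

    return u.fp32;
}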


@@ -1,162 +1,6 @@
 #pragma once
 #include "host_tensor.hpp"
 
-template <>
-void host_gemm<ushort, ushort, ushort>(const Tensor<ushort>& a,
-                                       const Tensor<ushort>& b,
-                                       Tensor<ushort>& c,
-                                       const GemmMatrixLayout layout)
-{
-    if(layout == GemmMatrixLayout::MK_KN_MN)
-    {
-        auto f_mk_kn_mn = [&](auto m, auto n) {
-            const int K = a.mDesc.GetLengths()[1];
-            double v = 0;
-            for(int k = 0; k < K; ++k)
-            {
-                v += ck::bf16_to_f32(a(m, k)) * ck::bf16_to_f32(b(k, n));
-            }
-            c(m, n) = ck::f32_to_bf16(v);
-        };
-        make_ParallelTensorFunctor(f_mk_kn_mn, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
-            std::thread::hardware_concurrency());
-    }
-    else if(layout == GemmMatrixLayout::MK_NK_MN)
-    {
-        auto f_mk_nk_mn = [&](auto m, auto n) {
-            const int K = a.mDesc.GetLengths()[1];
-            double v = 0;
-            for(int k = 0; k < K; ++k)
-            {
-                v += ck::bf16_to_f32(a(m, k)) * ck::bf16_to_f32(b(n, k));
-            }
-            c(m, n) = ck::f32_to_bf16(v);
-        };
-        make_ParallelTensorFunctor(f_mk_nk_mn, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
-            std::thread::hardware_concurrency());
-    }
-    else if(layout == GemmMatrixLayout::KM_KN_MN)
-    {
-        auto f_km_kn_mn = [&](auto m, auto n) {
-            const int K = a.mDesc.GetLengths()[0];
-            double v = 0;
-            for(int k = 0; k < K; ++k)
-            {
-                v += ck::bf16_to_f32(a(k, m)) * ck::bf16_to_f32(b(k, n));
-            }
-            c(m, n) = ck::f32_to_bf16(v);
-        };
-        make_ParallelTensorFunctor(f_km_kn_mn, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
-            std::thread::hardware_concurrency());
-    }
-    else if(layout == GemmMatrixLayout::KM_NK_MN)
-    {
-        auto f_km_nk_mn = [&](auto m, auto n) {
-            const int K = a.mDesc.GetLengths()[0];
-            double v = 0;
-            for(int k = 0; k < K; ++k)
-            {
-                v += ck::bf16_to_f32(a(k, m)) * ck::bf16_to_f32(b(n, k));
-            }
-            c(m, n) = ck::f32_to_bf16(v);
-        };
-        make_ParallelTensorFunctor(f_km_nk_mn, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
-            std::thread::hardware_concurrency());
-    }
-    else if(layout == GemmMatrixLayout::MK_KN_NM)
-    {
-        auto f_mk_kn_nm = [&](auto n, auto m) {
-            const int K = a.mDesc.GetLengths()[1];
-            double v = 0;
-            for(int k = 0; k < K; ++k)
-            {
-                v += ck::bf16_to_f32(a(m, k)) * ck::bf16_to_f32(b(k, n));
-            }
-            c(n, m) = ck::f32_to_bf16(v);
-        };
-        make_ParallelTensorFunctor(f_mk_kn_nm, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
-            std::thread::hardware_concurrency());
-    }
-    else if(layout == GemmMatrixLayout::MK_NK_NM)
-    {
-        auto f_mk_nk_nm = [&](auto n, auto m) {
-            const int K = a.mDesc.GetLengths()[1];
-            double v = 0;
-            for(int k = 0; k < K; ++k)
-            {
-                v += ck::bf16_to_f32(a(m, k)) * ck::bf16_to_f32(b(n, k));
-            }
-            c(n, m) = ck::f32_to_bf16(v);
-        };
-        make_ParallelTensorFunctor(f_mk_nk_nm, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
-            std::thread::hardware_concurrency());
-    }
-    else if(layout == GemmMatrixLayout::KM_KN_NM)
-    {
-        auto f_km_kn_nm = [&](auto n, auto m) {
-            const int K = a.mDesc.GetLengths()[0];
-            double v = 0;
-            for(int k = 0; k < K; ++k)
-            {
-                v += ck::bf16_to_f32(a(k, m)) * ck::bf16_to_f32(b(k, n));
-            }
-            c(n, m) = ck::f32_to_bf16(v);
-        };
-        make_ParallelTensorFunctor(f_km_kn_nm, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
-            std::thread::hardware_concurrency());
-    }
-    else if(layout == GemmMatrixLayout::KM_NK_NM)
-    {
-        auto f_km_nk_nm = [&](auto n, auto m) {
-            const int K = a.mDesc.GetLengths()[0];
-            double v = 0;
-            for(int k = 0; k < K; ++k)
-            {
-                v += ck::bf16_to_f32(a(k, m)) * ck::bf16_to_f32(b(n, k));
-            }
-            c(n, m) = ck::f32_to_bf16(v);
-        };
-        make_ParallelTensorFunctor(f_km_nk_nm, c.mDesc.GetLengths()[0], c.mDesc.GetLengths()[1])(
-            std::thread::hardware_concurrency());
-    }
-    else
-    {
-        throw std::runtime_error("wrong! not supported layout");
-    }
-}
-
 template <typename AType, typename BType, typename CType>
 void host_gemm_mk_kn_mn(const Tensor<AType>& a_m_k,
                         const Tensor<BType>& b_k_n,
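The specialization deleted above existed only because the bf16 path needed explicit bit conversions around every load and store. Once conversion is funnelled through a single templated helper, one reference GEMM template can cover every element type. Below is a minimal, self-contained sketch of that idea; convert, bf16_t, and the flat std::vector layout are stand-ins for illustration, not the repository's host_gemm_mk_kn_mn or ck::type_convert.

#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

using bf16_t = std::uint16_t; // bf16 carried as its raw 16-bit pattern

// Stand-in for a type_convert-style helper: static_cast by default,
// bit manipulation for the bf16 <-> float pair.
template <typename Y, typename X>
Y convert(X x)
{
    return static_cast<Y>(x);
}

template <>
float convert<float, bf16_t>(bf16_t x)
{
    std::uint32_t bits = std::uint32_t(x) << 16;
    float y;
    std::memcpy(&y, &bits, sizeof(y));
    return y;
}

template <>
bf16_t convert<bf16_t, float>(float x)
{
    std::uint32_t bits;
    std::memcpy(&bits, &x, sizeof(bits));
    return bf16_t(bits >> 16); // plain truncation, like the removed f32_to_bf16
}

// One templated reference GEMM (row-major A[MxK] * B[KxN] = C[MxN]):
// widen each element to float, accumulate in double, narrow the result back.
template <typename AType, typename BType, typename CType>
void host_gemm_mk_kn_mn(const std::vector<AType>& a,
                        const std::vector<BType>& b,
                        std::vector<CType>& c,
                        int M, int N, int K)
{
    for(int m = 0; m < M; ++m)
        for(int n = 0; n < N; ++n)
        {
            double v = 0;
            for(int k = 0; k < K; ++k)
                v += convert<float>(a[m * K + k]) * convert<float>(b[k * N + n]);
            c[m * N + n] = convert<CType>(static_cast<float>(v));
        }
}

int main()
{
    const int M = 2, N = 2, K = 2;
    std::vector<bf16_t> a{convert<bf16_t>(1.0f), convert<bf16_t>(2.0f),
                          convert<bf16_t>(3.0f), convert<bf16_t>(4.0f)};
    std::vector<bf16_t> b = a;
    std::vector<bf16_t> c(M * N);

    host_gemm_mk_kn_mn(a, b, c, M, N, K);

    std::cout << convert<float>(c[0]) << '\n'; // 1*1 + 2*3 = 7
}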


@@ -299,53 +299,41 @@ HostTensorDescriptor::HostTensorDescriptor(std::vector<X> lens, std::vector<Y> s
 void ostream_HostTensorDescriptor(const HostTensorDescriptor& desc, std::ostream& os = std::cout);
 
+float bf16_to_f32_(ushort src_val);
+
 template <typename T>
 void check_error(const Tensor<T>& ref, const Tensor<T>& result)
 {
     float error = 0;
     float max_diff = -1;
     float ref_value = 0, result_value = 0;
-    for(int i = 0; i < ref.mData.size(); ++i)
+    if constexpr(std::is_same<ushort, T>::value)
     {
-        error += std::abs(double(ref.mData[i]) - double(result.mData[i]));
-        float diff = std::abs(double(ref.mData[i]) - double(result.mData[i]));
-        if(max_diff < diff)
+        for(int i = 0; i < ref.mData.size(); ++i)
         {
-            max_diff = diff;
-            ref_value = ref.mData[i];
-            result_value = result.mData[i];
+            error += std::abs(bf16_to_f32_(ref.mData[i]) - bf16_to_f32_(result.mData[i]));
+            float diff = std::abs(bf16_to_f32_(ref.mData[i]) - bf16_to_f32_(result.mData[i]));
+            if(max_diff < diff)
+            {
+                max_diff = diff;
+                ref_value = bf16_to_f32_(ref.mData[i]);
+                result_value = bf16_to_f32_(result.mData[i]);
+            }
         }
     }
-
-    std::cout << "error: " << error << std::endl;
-    std::cout << "max_diff: " << max_diff << ", " << ref_value << ", " << result_value << std::endl;
-}
-
-__host__ __device__ float bf16_to_f32(ushort src_val)
-{
-    union
+    else
     {
-        uint32_t int32;
-        float fp32;
-    } u = {uint32_t(src_val) << 16};
-
-    return u.fp32;
-}
-
-template <>
-void check_error<ushort>(const Tensor<ushort>& ref, const Tensor<ushort>& result)
-{
-    float error = 0;
-    float max_diff = -1;
-    float ref_value = 0, result_value = 0;
-    for(int i = 0; i < ref.mData.size(); ++i)
-    {
-        error += std::abs(bf16_to_f32(ref.mData[i]) - bf16_to_f32(result.mData[i]));
-        float diff = std::abs(bf16_to_f32(ref.mData[i]) - bf16_to_f32(result.mData[i]));
-        if(max_diff < diff)
+        for(int i = 0; i < ref.mData.size(); ++i)
         {
-            max_diff = diff;
-            ref_value = bf16_to_f32(ref.mData[i]);
-            result_value = bf16_to_f32(result.mData[i]);
+            error += std::abs(double(ref.mData[i]) - double(result.mData[i]));
+            float diff = std::abs(double(ref.mData[i]) - double(result.mData[i]));
+            if(max_diff < diff)
+            {
+                max_diff = diff;
+                ref_value = ref.mData[i];
+                result_value = result.mData[i];
+            }
         }
     }
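The refactor above folds the old check_error<ushort> specialization into the primary template: an if constexpr on the element type keeps the bf16 decode in one place, and the branch that does not match T is discarded rather than instantiated, so the float/int paths never touch bf16_to_f32_. A small self-contained illustration of that dispatch pattern (abs_diff and bf16_bits_to_f32 are illustrative names, not repository functions):

#include <cmath>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <type_traits>

// Decode a raw bf16 bit pattern to float (same idea as bf16_to_f32_).
float bf16_bits_to_f32(std::uint16_t bits)
{
    std::uint32_t u = std::uint32_t(bits) << 16;
    float f;
    std::memcpy(&f, &u, sizeof(f));
    return f;
}

// One template handles both cases: the if constexpr branch that does not
// match T is discarded and never instantiated.
template <typename T>
double abs_diff(T ref, T res)
{
    if constexpr(std::is_same<std::uint16_t, T>::value)
        return std::abs(bf16_bits_to_f32(ref) - bf16_bits_to_f32(res));
    else
        return std::abs(double(ref) - double(res));
}

int main()
{
    std::cout << abs_diff(1.0f, 1.5f) << '\n';                    // 0.5
    std::cout << abs_diff<std::uint16_t>(0x3F80, 0x4000) << '\n'; // |1.0 - 2.0| = 1
}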


@@ -5,15 +5,25 @@
 #include "config.hpp"
 #include "data_type.hpp"
 
+template <typename T>
+struct GeneratorTensor_0
+{
+    template <typename... Is>
+    T operator()(Is...)
+    {
+        return T{0};
+    }
+};
+
 template <typename T>
 struct GeneratorTensor_1
 {
     int value = 1;
 
     template <typename... Is>
-    float operator()(Is...)
+    T operator()(Is...)
     {
-        return value;
+        return ck::type_convert<T>(value);
     }
 };
@@ -25,7 +35,7 @@ struct GeneratorTensor_1<ushort>
 
     template <typename... Is>
     ushort operator()(Is...)
     {
-        return ck::f32_to_bf16(value);
+        return ck::type_convert<ushort>(value);
     }
 };
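The new call sites rely on a helper with the shape ck::type_convert<Destination>(source): the destination type is spelled explicitly and the source type is deduced. Below is a minimal sketch of a helper with that shape, given purely for illustration; the real ck::type_convert ships with the library (presumably alongside data_type.hpp, included above) and adds bf16-aware specializations.

#include <cstdint>

namespace demo {

// Destination type Y is named at the call site, source type X is deduced,
// so demo::type_convert<std::uint16_t>(1) and demo::type_convert<float>(x)
// read exactly like the ck::type_convert calls in the diff.
template <typename Y, typename X>
constexpr Y type_convert(X x)
{
    return static_cast<Y>(x);
}

} // namespace demo

static_assert(demo::type_convert<int>(2.5f) == 2, "truncating float -> int");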
@@ -41,17 +51,6 @@ struct GeneratorTensor_1<int8_t>
     }
 };
 
-struct GeneratorTensor_0
-{
-    int value = 0;
-
-    template <typename... Is>
-    float operator()(Is...)
-    {
-        return value;
-    }
-};
-
 template <typename T>
 struct GeneratorTensor_2
 {
@@ -59,7 +58,7 @@ struct GeneratorTensor_2
     int max_value = 1;
 
     template <typename... Is>
-    float operator()(Is...)
+    T operator()(Is...)
    {
         return (std::rand() % (max_value - min_value)) + min_value;
     }
@@ -75,7 +74,7 @@ struct GeneratorTensor_2<ushort>
     ushort operator()(Is...)
     {
         float tmp = (std::rand() % (max_value - min_value)) + min_value;
-        return ck::f32_to_bf16(tmp);
+        return ck::type_convert<ushort>(tmp);
     }
 };
@@ -99,7 +98,7 @@ struct GeneratorTensor_3
     T max_value = 1;
 
     template <typename... Is>
-    float operator()(Is...)
+    T operator()(Is...)
     {
         float tmp = float(std::rand()) / float(RAND_MAX);
@@ -120,7 +119,7 @@ struct GeneratorTensor_3<ushort>
         float fp32_tmp = min_value + tmp * (max_value - min_value);
 
-        return ck::f32_to_bf16(fp32_tmp);
+        return ck::type_convert<ushort>(fp32_tmp);
     }
 };
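With every generator now returning T (via ck::type_convert) instead of always returning float, the same functor template can feed generic fill code for float, int8_t, and bf16 buffers alike. A short sketch of that use follows; fill_tensor and RandomIntGenerator are illustrative stand-ins, not the repository's Tensor API or GeneratorTensor_2.

#include <cstdint>
#include <cstdlib>
#include <vector>

// Simplified generator in the spirit of GeneratorTensor_2: random ints in
// [min_value, max_value), converted to the element type T on return
// (static_cast here stands in for ck::type_convert).
template <typename T>
struct RandomIntGenerator
{
    int min_value = 0;
    int max_value = 5;

    template <typename... Is>
    T operator()(Is...) const
    {
        return static_cast<T>((std::rand() % (max_value - min_value)) + min_value);
    }
};

// Generic fill helper: works for any element type, because the generator
// already hands back values of that type.
template <typename T, typename Generator>
void fill_tensor(std::vector<T>& data, Generator g)
{
    for(std::size_t i = 0; i < data.size(); ++i)
        data[i] = g(i);
}

int main()
{
    std::vector<float> a(8);
    std::vector<std::int8_t> b(8);

    fill_tensor(a, RandomIntGenerator<float>{-5, 5});
    fill_tensor(b, RandomIntGenerator<std::int8_t>{-5, 5});
}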