Gemm+Reduce Fusion (#128)

* add gridwise gemm v4r1

* rename

* adding gemm+reduce

* adding gemm+reduce

* adding gemm+reduce

* adding gemm+reduce

* use sfc in shuffling

* remove hardcode

* remove hardcode

* refactor

* fix build

* adding gemm+reduce

* adding gemm+reduce

* adding gemm+reduce

* adding gemm+reduce

* adding gemm+reduce

* format

* clean

* adding gemm+reduce

* adding profiler for gemm+reduce

* adding gemm+reduce profiler

* fix build

* clean up

* gemm+reduce

* fix build

* update DeviceGemm_Xdl_CShuffle; update enum to enum class

* clean up

* add test for gemm+reduce

* clean up

* refactor

* fix build

* fix build

[ROCm/composable_kernel commit: f95267f166]
This commit is contained in:
Chao Liu
2022-03-23 22:18:42 -05:00
committed by GitHub
parent 2f3f406393
commit 8cba08d07a
56 changed files with 4429 additions and 297 deletions

View File

@@ -64,6 +64,8 @@ void ostream_HostTensorDescriptor(const HostTensorDescriptor& desc, std::ostream
os << "}" << std::endl;
}
#if 1
// FIXME: remove
float bf16_to_f32_(ck::bhalf_t src_val)
{
union
@@ -74,8 +76,10 @@ float bf16_to_f32_(ck::bhalf_t src_val)
return u.fp32;
}
// FIXME: remove
void bf16_to_f32_(const Tensor<ck::bhalf_t>& src, Tensor<float>& dst)
{
for(int i = 0; i < src.mData.size(); ++i)
dst.mData[i] = bf16_to_f32_(src.mData[i]);
}
#endif