Bf16*fp4 gemm (#2801)

* support bf16*mxfp4 gemm

* rebase bf16*fp4 example to develop branch

* Clean up commented debug code in GEMM kernel

* rename example folder

* support bf16*mxfp4 gemm

* rebase bf16*fp4 example to develop branch

* Clean up commented debug code in GEMM kernel

* rename example folder

* rebase to new develop

* fix clang format

* update code according to reviewer's comment

* Update README.md

* update code according to reviewer's comment

* update code according to reviewer's comment

* Update CMakeLists.txt

* Update README.md

* Update CMakeLists.txt

* Delete files

* Delete files

* Add unit tests

* Update test_gemm_quant_base.hpp

* merge bf16*fp4 example to develop branch

* fix clang format

* fix clang format

* Update CMakeLists.txt

* fix ci test

* fix clang format

* resolve conflicts

---------

Co-authored-by: eliotwang <charyang@smci355-ccs-aus-m10-29.cs-aus.dcgpu>
Co-authored-by: ShaoChunLee <Shao-Chun.Lee@amd.com>
Co-authored-by: Illia Silin <98187287+illsilin@users.noreply.github.com>
Co-authored-by: illsilin_amdeng <Illia.Silin@amd.com>
Co-authored-by: Thomas Ning <Thomas.Ning@amd.com>
This commit is contained in:
eliotwang
2025-12-11 23:20:29 +08:00
committed by GitHub
parent ce99cab605
commit 715671e419
23 changed files with 1260 additions and 137 deletions

View File

@@ -52,9 +52,19 @@ template <typename ComputeDataType, typename OutDataType, typename AccDataType =
CK_TILE_HOST double get_relative_threshold(const int number_of_accumulations = 1)
{
static_assert(
is_any_of<ComputeDataType, F8, BF8, F16, BF16, F32, pk_int4_t, I8, I32, int>::value,
"Warning: Unhandled ComputeDataType for setting up the relative threshold!");
static_assert(is_any_of<ComputeDataType,
F8,
BF8,
F16,
BF16,
F32,
pk_fp4_t,
pk_fp4_raw_t,
pk_int4_t,
I8,
I32,
int>::value,
"Warning: Unhandled ComputeDataType for setting up the relative threshold!");
double compute_error = 0;
if constexpr(is_any_of<ComputeDataType, pk_int4_t, I8, I32, int>::value)
@@ -113,9 +123,19 @@ CK_TILE_HOST double get_absolute_threshold(const double max_possible_num,
const int number_of_accumulations = 1)
{
static_assert(
is_any_of<ComputeDataType, F8, BF8, F16, BF16, F32, pk_int4_t, I8, I32, int>::value,
"Warning: Unhandled ComputeDataType for setting up the absolute threshold!");
static_assert(is_any_of<ComputeDataType,
F8,
BF8,
F16,
BF16,
F32,
pk_fp4_t,
pk_fp4_raw_t,
pk_int4_t,
I8,
I32,
int>::value,
"Warning: Unhandled ComputeDataType for setting up the absolute threshold!");
auto expo = std::log2(std::abs(max_possible_num));
double compute_error = 0;