Bf16*fp4 gemm (#2801)

* support bf16*mxfp4 gemm

* rebase bf16*fp4 example to develop branch

* Clean up commented debug code in GEMM kernel

* rename example folder

* support bf16*mxfp4 gemm

* rebase bf16*fp4 example to develop branch

* Clean up commented debug code in GEMM kernel

* rename example folder

* rebase to new develop

* fix clang format

* update code according to reviewer's comment

* Update README.md

* update code according to reviewer's comment

* update code according to reviewer's comment

* Update CMakeLists.txt

* Update README.md

* Update CMakeLists.txt

* Delete files

* Delete files

* Add unit tests

* Update test_gemm_quant_base.hpp

* merge bf16*fp4 example to develop branch

* fix clang format

* fix clang format

* Update CMakeLists.txt

* fix ci test

* fix clang format

* resolve conflicts

---------

Co-authored-by: eliotwang <charyang@smci355-ccs-aus-m10-29.cs-aus.dcgpu>
Co-authored-by: ShaoChunLee <Shao-Chun.Lee@amd.com>
Co-authored-by: Illia Silin <98187287+illsilin@users.noreply.github.com>
Co-authored-by: illsilin_amdeng <Illia.Silin@amd.com>
Co-authored-by: Thomas Ning <Thomas.Ning@amd.com>
This commit is contained in:
eliotwang
2025-12-11 23:20:29 +08:00
committed by GitHub
parent ce99cab605
commit 715671e419
23 changed files with 1260 additions and 137 deletions

View File

@@ -52,9 +52,19 @@ template <typename ComputeDataType, typename OutDataType, typename AccDataType =
CK_TILE_HOST double get_relative_threshold(const int number_of_accumulations = 1)
{
static_assert(
is_any_of<ComputeDataType, F8, BF8, F16, BF16, F32, pk_int4_t, I8, I32, int>::value,
"Warning: Unhandled ComputeDataType for setting up the relative threshold!");
static_assert(is_any_of<ComputeDataType,
F8,
BF8,
F16,
BF16,
F32,
pk_fp4_t,
pk_fp4_raw_t,
pk_int4_t,
I8,
I32,
int>::value,
"Warning: Unhandled ComputeDataType for setting up the relative threshold!");
double compute_error = 0;
if constexpr(is_any_of<ComputeDataType, pk_int4_t, I8, I32, int>::value)
@@ -113,9 +123,19 @@ CK_TILE_HOST double get_absolute_threshold(const double max_possible_num,
const int number_of_accumulations = 1)
{
static_assert(
is_any_of<ComputeDataType, F8, BF8, F16, BF16, F32, pk_int4_t, I8, I32, int>::value,
"Warning: Unhandled ComputeDataType for setting up the absolute threshold!");
static_assert(is_any_of<ComputeDataType,
F8,
BF8,
F16,
BF16,
F32,
pk_fp4_t,
pk_fp4_raw_t,
pk_int4_t,
I8,
I32,
int>::value,
"Warning: Unhandled ComputeDataType for setting up the absolute threshold!");
auto expo = std::log2(std::abs(max_possible_num));
double compute_error = 0;

View File

@@ -246,6 +246,63 @@ CK_TILE_HOST void reference_gemm_tensor_quant(const HostTensor<ADataType>& a_m_k
make_ParallelTensorFunctor(f_mn, M, N)(std::thread::hardware_concurrency());
}
template <typename ADataType,
          typename QDataType,
          typename BDataType,
          typename AccDataType,
          typename CDataType,
          typename QuantGroupSize,
          bool aquant,
          typename AElementOp   = ck_tile::identity,
          typename BElementOp   = ck_tile::identity,
          typename ACCElementOp = ck_tile::identity>
/// @brief Host reference GEMM for BF16 x MXFP4: C[m,n] = acc_op(sum_k a_op(A[m,k]) * (scale * b_op(B[k,n]))).
///
/// B is stored packed, two FP4 values per element, so the k-loop walks K/2
/// packed pairs (K is assumed even). Each group of QuantGroupSize::kK logical
/// k-elements of column n shares one scale q(k / kK, n); the stored scale is a
/// biased exponent (bias 127, E8M0-style) and the effective factor is
/// 2^(q - 127).
///
/// @param a_m_k         activation tensor, M x K
/// @param q             per-group scale tensor for B (biased exponents)
/// @param b_k_n         packed FP4 weight tensor, (K/2) x N
/// @param c_m_n         output tensor, M x N (written)
/// @param a_element_op  elementwise op applied to A values
/// @param b_element_op  elementwise op applied to packed B values
/// @param acc_element_op elementwise op applied to the final accumulator
CK_TILE_HOST void reference_mxfp4gemm_quant(const HostTensor<ADataType>& a_m_k,
                                            const HostTensor<QDataType>& q,
                                            const HostTensor<BDataType>& b_k_n,
                                            HostTensor<CDataType>& c_m_n,
                                            const AElementOp& a_element_op     = {},
                                            const BElementOp& b_element_op     = {},
                                            const ACCElementOp& acc_element_op = {})
{
    const std::size_t M = a_m_k.get_length(0);
    const std::size_t N = b_k_n.get_length(1);
    const std::size_t K = a_m_k.get_length(1);

    auto f_mn = [&](auto m, auto n) {
        using ComputeType = float;

        AccDataType v_acc = 0;
        // Walk packed pairs: each B element holds logical k-values 2k and 2k+1.
        for(std::size_t k = 0; k < (K / 2); k++)
        {
            // Stored scale is a biased exponent (bias 127); both values of the
            // pair belong to the same quant group, so one scale covers both.
            auto b_scale = type_convert<int32_t>(q((2 * k) / QuantGroupSize::kK, n)) - 127;

            ComputeType v_a_0 = ck_tile::type_convert<ComputeType>(a_element_op(a_m_k(m, 2 * k)));
            ComputeType v_a_1 =
                ck_tile::type_convert<ComputeType>(a_element_op(a_m_k(m, 2 * k + 1)));

            // Initialize to 0 so an unsupported BDataType contributes nothing
            // instead of reading uninitialized values (only pk_fp4_raw_t is
            // decoded below).
            ComputeType v_b_0 = 0;
            ComputeType v_b_1 = 0;
            if constexpr(std::is_same_v<BDataType, pk_fp4_raw_t>)
            {
                auto b_pack      = type_convert<pk_fp4_t>(b_element_op(b_k_n(k, n)));
                auto b_scale_fp4 = type_convert<float>(std::pow(2.0f, b_scale));
                auto b_f4_lo     = type_convert<pk_fp4_t>(b_pack.unpack(number<0>{}));
                auto b_f4_hi     = type_convert<pk_fp4_t>(b_pack.unpack(number<1>{}));
                v_b_0            = type_convert<ComputeType>(b_f4_lo) * b_scale_fp4;
                v_b_1            = type_convert<ComputeType>(b_f4_hi) * b_scale_fp4;
            }

            // Accumulate the dot-product contribution of this packed pair.
            AccDataType partial = v_a_0 * v_b_0 + v_a_1 * v_b_1;
            v_acc += partial;
        }
        c_m_n(m, n) = ck_tile::type_convert<CDataType>(acc_element_op(v_acc));
    };

    make_ParallelTensorFunctor(f_mn, M, N)(std::thread::hardware_concurrency());
}
template <typename ADataType,
typename BDataType,
typename AccDataType,