// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <cstdlib>
#include <thread>

#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"

namespace ck_tile {

template <typename ADataType,
          typename QDataType,
          typename BDataType,
          typename AccDataType,
          typename CDataType,
          uint32_t QuantGroupSize,
          bool aquant,
          typename AElementOp   = ck_tile::identity,
          typename BElementOp   = ck_tile::identity,
          typename ACCElementOp = ck_tile::identity>
CK_TILE_HOST void reference_gemm_quant(const HostTensor<ADataType>& a_m_k,
                                       const HostTensor<QDataType>& q,
                                       const HostTensor<BDataType>& b_k_n,
                                       HostTensor<CDataType>& c_m_n,
                                       const AElementOp& a_element_op     = {},
                                       const BElementOp& b_element_op     = {},
                                       const ACCElementOp& acc_element_op = {})
{
    const std::size_t M = a_m_k.get_length(0);
    const std::size_t N = b_k_n.get_length(1);
    const std::size_t K = a_m_k.get_length(1);

    auto f_mn = [&](auto m, auto n) {
        AccDataType v_acc = 0, v_block_acc = 0;

        static_assert(std::is_same_v<ADataType, pk_int4_t> || std::is_same_v<ADataType, fp8_t> ||
                      std::is_same_v<ADataType, bf8_t>);
        static_assert(std::is_same_v<BDataType, fp8_t> || std::is_same_v<BDataType, bf8_t> ||
                      std::is_same_v<BDataType, pk_int4_t>);
        static_assert(std::is_same_v<AccDataType, float>);
        static_assert(std::is_same_v<CDataType, float> ||
                      std::is_same_v<CDataType, ck_tile::half_t>);
        for(std::size_t k = 0; k < K; ++k)
        {
            AccDataType v_a;
            AccDataType v_b;
            if constexpr(std::is_same_v<ADataType, pk_int4_t>)
            {
                const pk_int4_t pk_val  = a_element_op(a_m_k(m, k));
                const fp32x2_t fp32_val = pk_int4_t_to_fp32x2_t(pk_val);
                // Each pk_int4_t packs two 4-bit values; odd k selects the high nibble.
                if(k % 2 == 1)
                    v_a = fp32_val.hi;
                else
                    v_a = fp32_val.lo;
            }
            else
            {
                v_a = ck_tile::type_convert<AccDataType>(a_element_op(a_m_k(m, k)));
            }
            if constexpr(std::is_same_v<BDataType, pk_int4_t>)
            {
                const pk_int4_t pk_val  = b_element_op(b_k_n(k, n));
                const fp32x2_t fp32_val = pk_int4_t_to_fp32x2_t(pk_val);
                if(k % 2 == 1)
                    v_b = fp32_val.hi;
                else
                    v_b = fp32_val.lo;
            }
            else if constexpr(std::is_same_v<BDataType, fp8_t>)
            {
                v_b = fp8_to_float_raw(b_element_op(b_k_n(k, n)));
            }
            else
            {
                v_b = ck_tile::type_convert<AccDataType>(b_element_op(b_k_n(k, n)));
            }
            v_block_acc += v_a * v_b;

            // Apply group dequant scale once per QuantGroupSize elements along K
            if((k + 1) % QuantGroupSize == 0)
            {
                float scale = 0.f;
                // aquant: per-row A scales q(m, k / QuantGroupSize);
                // otherwise per-column B scales q(k / QuantGroupSize, n).
                index_t outer_dim = (aquant) ? m : k / QuantGroupSize;
                index_t inner_dim = (aquant) ? k / QuantGroupSize : n;

                if constexpr(std::is_same_v<QDataType, float>)
                {
                    scale = q(outer_dim, inner_dim);
                }
                else if constexpr(std::is_same_v<QDataType, ck_tile::fp8_t>)
                {
                    scale = fp8_to_float_raw(q(outer_dim, inner_dim));
                }
                else if constexpr(std::is_same_v<QDataType, ck_tile::bf8_t>)
                {
                    scale = bf8_to_float_raw(q(outer_dim, inner_dim));
                }
                else
                {
                    static_assert(false, "Unexpected Q datatype.");
                }
                v_block_acc *= scale;
                v_acc += v_block_acc;
                v_block_acc = 0;
            }
        }

        c_m_n(m, n) = ck_tile::type_convert<CDataType>(acc_element_op(v_acc));
    };

    make_ParallelTensorFunctor(f_mn, M, N)(std::thread::hardware_concurrency());
}
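
// Usage sketch (illustrative only; the tensor names and sizes below are hypothetical).
// With aquant == true, q holds one scale per (row, K-group) of A, so for
// QuantGroupSize = 128 and K = 512 there are four groups per row:
//
//   HostTensor<fp8_t> a({M, K});
//   HostTensor<float> q({M, K / 128});
//   HostTensor<fp8_t> b({K, N});
//   HostTensor<float> c({M, N});
//   reference_gemm_quant<fp8_t, float, fp8_t, float, float, 128, true>(a, q, b, c);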

template <typename ADataType,
          typename AQDataType,
          typename BDataType,
          typename BQDataType,
          typename AccDataType,
          typename CDataType,
          typename AElementOp   = ck_tile::identity,
          typename BElementOp   = ck_tile::identity,
          typename ACCElementOp = ck_tile::identity>
CK_TILE_HOST void reference_gemm_rowcol_quant(const HostTensor<ADataType>& a_m_k,
                                              const HostTensor<AQDataType>& aq_m_1,
                                              const HostTensor<BDataType>& b_k_n,
                                              const HostTensor<BQDataType>& bq_1_n,
                                              HostTensor<CDataType>& c_m_n,
                                              const AElementOp& a_element_op     = {},
                                              const BElementOp& b_element_op     = {},
                                              const ACCElementOp& acc_element_op = {})
{
    static_assert(std::is_same_v<ADataType, fp8_t> || std::is_same_v<ADataType, bf8_t>);
    static_assert(std::is_same_v<BDataType, fp8_t> || std::is_same_v<BDataType, bf8_t>);
    static_assert(std::is_same_v<AccDataType, float>);
    static_assert(std::is_same_v<CDataType, float> || std::is_same_v<CDataType, ck_tile::half_t>);
    static_assert(std::is_same_v<AQDataType, float> && std::is_same_v<BQDataType, float>);
    const std::size_t M = a_m_k.get_length(0);
    const std::size_t N = b_k_n.get_length(1);
    const std::size_t K = a_m_k.get_length(1);

    auto f_mn = [&](auto m, auto n) {
        // Init accumulator
        AccDataType v_acc = 0;
        // Get row scale for A and column scale for B
        float a_scale = aq_m_1(m, 0);
        float b_scale = bq_1_n(0, n);

        // Compute the dot product
        for(std::size_t k = 0; k < K; ++k)
        {
            AccDataType v_a;
            AccDataType v_b;

            // Process A data
            if constexpr(std::is_same_v<ADataType, pk_int4_t>)
            {
                const pk_int4_t pk_val  = a_element_op(a_m_k(m, k));
                const fp32x2_t fp32_val = pk_int4_t_to_fp32x2_t_signed_conversion(pk_val);
                if(k % 2 == 1)
                    v_a = fp32_val.hi;
                else
                    v_a = fp32_val.lo;
            }
            else
            {
                v_a = ck_tile::type_convert<AccDataType>(a_element_op(a_m_k(m, k)));
            }

            // Process B data
            if constexpr(std::is_same_v<BDataType, pk_int4_t>)
            {
                const pk_int4_t pk_val  = b_element_op(b_k_n(k, n));
                const fp32x2_t fp32_val = pk_int4_t_to_fp32x2_t_signed_conversion(pk_val);
                if(k % 2 == 1)
                    v_b = fp32_val.hi;
                else
                    v_b = fp32_val.lo;
            }
            else
            {
                v_b = ck_tile::type_convert<AccDataType>(b_element_op(b_k_n(k, n)));
            }

            v_acc += v_a * v_b;
        }

        v_acc = v_acc * a_scale * b_scale;

        c_m_n(m, n) = ck_tile::type_convert<CDataType>(acc_element_op(v_acc));
    };

    make_ParallelTensorFunctor(f_mn, M, N)(std::thread::hardware_concurrency());
}
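
// Usage sketch (hypothetical shapes): row/col quant expects one scale per row of A
// and one per column of B, e.g.
//
//   HostTensor<fp8_t> a({M, K});
//   HostTensor<float> aq({M, 1}); // per-row A scales
//   HostTensor<fp8_t> b({K, N});
//   HostTensor<float> bq({1, N}); // per-column B scales
//   HostTensor<half_t> c({M, N});
//   reference_gemm_rowcol_quant<fp8_t, float, fp8_t, float, float, half_t>(a, aq, b, bq, c);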

template <typename ADataType,
          typename AQDataType,
          typename BDataType,
          typename BQDataType,
          typename AccDataType,
          typename CDataType,
          typename AElementOp   = ck_tile::identity,
          typename BElementOp   = ck_tile::identity,
          typename ACCElementOp = ck_tile::identity>
CK_TILE_HOST void reference_gemm_tensor_quant(const HostTensor<ADataType>& a_m_k,
                                              const HostTensor<AQDataType>& aq_1_1,
                                              const HostTensor<BDataType>& b_k_n,
                                              const HostTensor<BQDataType>& bq_1_1,
                                              HostTensor<CDataType>& c_m_n,
                                              const AElementOp& a_element_op     = {},
                                              const BElementOp& b_element_op     = {},
                                              const ACCElementOp& acc_element_op = {})
{
    static_assert(std::is_same_v<ADataType, fp8_t> || std::is_same_v<ADataType, bf8_t>);
    static_assert(std::is_same_v<BDataType, fp8_t> || std::is_same_v<BDataType, bf8_t>);
    static_assert(std::is_same_v<AccDataType, float>);
    static_assert(std::is_same_v<CDataType, float> || std::is_same_v<CDataType, ck_tile::half_t>);
    static_assert(std::is_same_v<AQDataType, float> && std::is_same_v<BQDataType, float>);
    const std::size_t M = a_m_k.get_length(0);
    const std::size_t N = b_k_n.get_length(1);
    const std::size_t K = a_m_k.get_length(1);

    auto f_mn = [&](auto m, auto n) {
        // Init accumulator
        AccDataType v_acc = 0;
        // Get scale for A and scale for B
        const AccDataType a_scale = ck_tile::type_convert<AccDataType>(aq_1_1(0, 0));
        const AccDataType b_scale = ck_tile::type_convert<AccDataType>(bq_1_1(0, 0));

        // Compute the dot product
        for(std::size_t k = 0; k < K; ++k)
        {
            AccDataType v_a = ck_tile::type_convert<AccDataType>(a_element_op(a_m_k(m, k)));
            AccDataType v_b = ck_tile::type_convert<AccDataType>(b_element_op(b_k_n(k, n)));

            v_acc += v_a * v_b;
        }

        v_acc = v_acc * a_scale * b_scale;

        c_m_n(m, n) = ck_tile::type_convert<CDataType>(acc_element_op(v_acc));
    };

    make_ParallelTensorFunctor(f_mn, M, N)(std::thread::hardware_concurrency());
}
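
// Tensor-wise quant applies a single combined factor to the whole output,
// i.e. C = (a_scale * b_scale) * (A @ B). A minimal sketch with hypothetical
// 1x1 scale tensors:
//
//   HostTensor<float> aq({1, 1}), bq({1, 1});
//   aq(0, 0) = 0.5f;
//   bq(0, 0) = 2.0f; // net scale = 1.0
//   reference_gemm_tensor_quant<fp8_t, float, fp8_t, float, float, float>(a, aq, b, bq, c);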

template <typename ADataType,
          typename BDataType,
          typename AccDataType,
          typename CDataType,
          typename AElementOp   = ck_tile::identity,
          typename BElementOp   = ck_tile::identity,
          typename ACCElementOp = ck_tile::identity>
CK_TILE_HOST void reference_gemm(const HostTensor<ADataType>& a_m_k,
                                 const HostTensor<BDataType>& b_k_n,
                                 HostTensor<CDataType>& c_m_n,
                                 const AElementOp& a_element_op     = {},
                                 const BElementOp& b_element_op     = {},
                                 const ACCElementOp& acc_element_op = {})
{
    const std::size_t M = a_m_k.get_length(0);
    const std::size_t N = b_k_n.get_length(1);
    const std::size_t K = a_m_k.get_length(1);

    auto f_mn = [&](auto m, auto n) {
        AccDataType v_acc = 0;

        for(std::size_t k = 0; k < K; ++k)
        {
            AccDataType v_a;
            AccDataType v_b;
            if constexpr(std::is_same_v<ADataType, pk_int4_t>)
            {
                const pk_int4_t pk_val  = a_element_op(a_m_k(m, k));
                const fp32x2_t fp32_val = pk_int4_t_to_fp32x2_t(pk_val);
                if(k % 2 == 1)
                    v_a = fp32_val.hi;
                else
                    v_a = fp32_val.lo;
            }
            else
            {
                v_a = ck_tile::type_convert<AccDataType>(a_element_op(a_m_k(m, k)));
            }
            if constexpr(std::is_same_v<BDataType, pk_int4_t>)
            {
                const pk_int4_t pk_val  = b_element_op(b_k_n(k, n));
                const fp32x2_t fp32_val = pk_int4_t_to_fp32x2_t(pk_val);
                if(k % 2 == 1)
                    v_b = fp32_val.hi;
                else
                    v_b = fp32_val.lo;
            }
            else
            {
                v_b = ck_tile::type_convert<AccDataType>(b_element_op(b_k_n(k, n)));
            }
            v_acc += v_a * v_b;
        }

        c_m_n(m, n) = ck_tile::type_convert<CDataType>(acc_element_op(v_acc));
    };

    make_ParallelTensorFunctor(f_mn, M, N)(std::thread::hardware_concurrency());
}
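
// Minimal sketch of the unquantized reference path (hypothetical names):
//
//   HostTensor<half_t> a({M, K}), b({K, N}), c({M, N});
//   reference_gemm<half_t, half_t, float, half_t>(a, b, c);
//
// The same call pattern applies to pk_int4_t operands; the element at logical
// index k is the low nibble when k is even and the high nibble when k is odd.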

template <typename AsDataType,
          typename BsDataType,
          typename DsDataType,
          typename AccDataType,
          typename CDataType,
          typename AElementOp,
          typename BElementOp,
          typename CDElementOp,
          typename ADataType = remove_cvref_t<std::tuple_element_t<0, AsDataType>>,
          typename BDataType = remove_cvref_t<std::tuple_element_t<0, BsDataType>>,
          typename DDataType = remove_cvref_t<std::tuple_element_t<0, DsDataType>>>
CK_TILE_HOST void
reference_gemm_multiple_abd(const std::array<HostTensor<ADataType>, AsDataType::size()>& as_m_k,
                            const std::array<HostTensor<BDataType>, BsDataType::size()>& bs_k_n,
                            const std::array<HostTensor<DDataType>, DsDataType::size()>& ds_m_n,
                            HostTensor<ADataType>& a_m_k,
                            HostTensor<BDataType>& b_k_n,
                            HostTensor<CDataType>& c_m_n,
                            const AElementOp& a_element_op    = {},
                            const BElementOp& b_element_op    = {},
                            const CDElementOp& acc_element_op = {})
{
    const std::size_t M = a_m_k.get_length(0);
    const std::size_t N = b_k_n.get_length(1);
    const std::size_t K = a_m_k.get_length(1);

    auto as_m_k_tuple =
        generate_tie([&](auto idx) -> auto& { return as_m_k[idx]; }, number<AsDataType::size()>{});

    auto bs_k_n_tuple =
        generate_tie([&](auto idx) -> auto& { return bs_k_n[idx]; }, number<BsDataType::size()>{});

    auto ds_m_n_tuple =
        generate_tie([&](auto idx) -> auto& { return ds_m_n[idx]; }, number<DsDataType::size()>{});

    // Apply elementwise function to A
    auto a_elementwise_fn = [&](auto i, auto j) {
        ck_tile::apply([&](auto&&... t) { a_element_op(a_m_k(i, j), t(i, j)...); }, as_m_k_tuple);
    };

    make_ParallelTensorFunctor(a_elementwise_fn, M, K)(std::thread::hardware_concurrency());

    // Apply elementwise function to B
    auto b_elementwise_fn = [&](auto i, auto j) {
        ck_tile::apply([&](auto&&... t) { b_element_op(b_k_n(i, j), t(i, j)...); }, bs_k_n_tuple);
    };

    make_ParallelTensorFunctor(b_elementwise_fn, K, N)(std::thread::hardware_concurrency());

    auto f_mk_kn_mn = [&](auto m, auto n) {
        AccDataType v_acc = 0;
        for(std::size_t k = 0; k < K; ++k)
        {
            ADataType v_a = a_m_k(m, k);
            BDataType v_b = b_k_n(k, n);
            v_acc +=
                ck_tile::type_convert<AccDataType>(v_a) * ck_tile::type_convert<AccDataType>(v_b);
        }

        CDataType v_c = 0;

        ck_tile::apply(
            [&](auto&&... t) {
                acc_element_op(v_c,
                               ck_tile::type_convert<float>(v_acc),
                               ck_tile::type_convert<float>(t(m, n))...);
            },
            ds_m_n_tuple);

        c_m_n(m, n) = ck_tile::type_convert<CDataType>(v_c);
    };

    make_ParallelTensorFunctor(f_mk_kn_mn, M, N)(std::thread::hardware_concurrency());
}
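
// Note: unlike the other references, this variant first materializes the fused
// operands: a_element_op combines all tensors in as_m_k into a_m_k in place
// (signature: op(dst, src0, src1, ...)), b_element_op does the same for B, and
// only then does the plain GEMM loop run, with acc_element_op folding in the
// D tensors. A hypothetical two-source A element-op, for illustration only:
//
//   struct AddA01
//   {
//       void operator()(half_t& y, half_t a0, half_t a1) const { y = a0 + a1; }
//   };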

template <typename ADataType,
          typename BDataType,
          typename ScaleDataType,
          typename AccDataType,
          typename CDataType,
          typename AElementOp   = ck_tile::identity,
          typename BElementOp   = ck_tile::identity,
          typename ACCElementOp = ck_tile::identity>
CK_TILE_HOST void reference_mx_gemm(const HostTensor<ADataType>& a_m_k,
                                    const HostTensor<BDataType>& b_k_n,
                                    HostTensor<CDataType>& c_m_n,
                                    const HostTensor<ScaleDataType>& scale_a,
                                    const HostTensor<ScaleDataType>& scale_b,
                                    const AElementOp& = {},
                                    const BElementOp& = {},
                                    const ACCElementOp& = {})
{
    static_assert(std::is_same_v<AElementOp, ck_tile::identity>);
    static_assert(std::is_same_v<BElementOp, ck_tile::identity>);
    static_assert(std::is_same_v<ACCElementOp, ck_tile::identity>);

    const std::size_t M = a_m_k.get_length(0);
    const std::size_t N = b_k_n.get_length(1);
    const std::size_t K = a_m_k.get_length(1);

    const std::size_t ScaleBlockSize = K / scale_a.get_length(1);

    HostTensor<AccDataType> a_m_k_scaled({std::size_t(M), std::size_t(K)},
                                         {std::size_t(K), std::size_t(1)});
    HostTensor<AccDataType> b_k_n_scaled({std::size_t(K), std::size_t(N)},
                                         {std::size_t(1), std::size_t(K)});

    for(std::size_t m = 0; m < M; ++m)
    {
        for(std::size_t k = 0; k < K; ++k)
        {
            if constexpr(std::is_same_v<ADataType, pk_fp4_t>)
            {
                if(k % 2 == 1)
                    continue; // skip odd k; each pk_fp4_t holds elements k and k + 1

                auto a_f4x2  = a_m_k(m, k);
                auto a_scale = ck_tile::type_convert<AccDataType>(scale_a(m, k / ScaleBlockSize));
                auto a_f4_lo =
                    ck_tile::type_convert<AccDataType>(a_f4x2.template unpack<>(number<0>{}));
                auto a_f4_hi =
                    ck_tile::type_convert<AccDataType>(a_f4x2.template unpack<>(number<1>{}));

                a_m_k_scaled(m, k)     = a_f4_lo * a_scale;
                a_m_k_scaled(m, k + 1) = a_f4_hi * a_scale;
            }
            else
            {
                // Mirror the B path below so non-packed A types are dequantized too;
                // without this branch a_m_k_scaled would stay uninitialized for them.
                a_m_k_scaled(m, k) =
                    ck_tile::type_convert<AccDataType>(a_m_k(m, k)) *
                    ck_tile::type_convert<AccDataType>(scale_a(m, k / ScaleBlockSize));
            }
        }
    }

    for(std::size_t n = 0; n < N; n++)
    {
        for(std::size_t k = 0; k < K; k++)
        {
            if constexpr(std::is_same_v<BDataType, pk_fp4_t>)
            {
                if(k % 2 == 1)
                    continue; // skip odd k

                auto b_f4x2  = b_k_n(k, n);
                auto b_scale = ck_tile::type_convert<AccDataType>(scale_b(k / ScaleBlockSize, n));
                auto b_f4_lo =
                    ck_tile::type_convert<AccDataType>(b_f4x2.template unpack<>(number<0>{}));
                auto b_f4_hi =
                    ck_tile::type_convert<AccDataType>(b_f4x2.template unpack<>(number<1>{}));

                b_k_n_scaled(k, n)     = b_f4_lo * b_scale;
                b_k_n_scaled(k + 1, n) = b_f4_hi * b_scale;
            }
            else
            {
                b_k_n_scaled(k, n) =
                    ck_tile::type_convert<AccDataType>((b_k_n(k, n))) *
                    ck_tile::type_convert<AccDataType>(scale_b(k / ScaleBlockSize, n));
            }
        }
    }

    // call reference gemm
    reference_gemm<AccDataType, AccDataType, AccDataType, CDataType>(
        a_m_k_scaled, b_k_n_scaled, c_m_n);
}
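
// ScaleBlockSize is inferred from the scale tensor itself: with K = 256 and
// scale_a of shape (M, 8), each scale covers K / 8 = 32 consecutive k elements,
// the usual MX block size. scale_a(m, k / ScaleBlockSize) (typically an e8m0
// value) is applied to both fp4 nibbles of each packed pair inside that block.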

template <typename ADataType,
          typename BDataType,
          typename DsDataType,
          typename AccDataType,
          typename CDataType,
          typename ACCElementOp,
          typename DDataType = remove_cvref_t<std::tuple_element_t<0, DsDataType>>>
CK_TILE_HOST void
reference_gemm_multiple_d(const HostTensor<ADataType>& a_m_k,
                          const HostTensor<BDataType>& b_k_n,
                          const std::array<HostTensor<DDataType>, DsDataType::size()>& ds_m_n,
                          HostTensor<CDataType>& c_m_n,
                          const ACCElementOp& acc_element_op = {})
{
    const std::size_t M = a_m_k.get_length(0);
    const std::size_t N = b_k_n.get_length(1);
    const std::size_t K = a_m_k.get_length(1);

    auto f_mk_kn_mn = [&](auto m, auto n) {
        AccDataType v_acc = 0;
        for(std::size_t k = 0; k < K; ++k)
        {
            ADataType v_a = a_m_k(m, k);
            BDataType v_b = b_k_n(k, n);
            v_acc +=
                ck_tile::type_convert<AccDataType>(v_a) * ck_tile::type_convert<AccDataType>(v_b);
        }

        CDataType v_c = 0;
        if constexpr(DsDataType::size() == 0)
        {
            acc_element_op(v_c, ck_tile::type_convert<float>(v_acc));
        }
        else if constexpr(DsDataType::size() == 1)
        {
            acc_element_op(v_c,
                           ck_tile::type_convert<float>(v_acc),
                           ck_tile::type_convert<float>(ds_m_n[0](m, n)));
        }
        else if constexpr(DsDataType::size() == 2)
        {
            acc_element_op(v_c,
                           ck_tile::type_convert<float>(v_acc),
                           ck_tile::type_convert<float>(ds_m_n[0](m, n)),
                           ck_tile::type_convert<float>(ds_m_n[1](m, n)));
        }
        c_m_n(m, n) = ck_tile::type_convert<CDataType>(v_c);
    };

    make_ParallelTensorFunctor(f_mk_kn_mn, M, N)(std::thread::hardware_concurrency());
}
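
// Usage sketch with a single D tensor and a hypothetical bias-add element-op
// (the op receives the C reference first, then acc and each D value as float):
//
//   struct AccAddBias
//   {
//       void operator()(half_t& c, float acc, float d) const
//       {
//           c = type_convert<half_t>(acc + d);
//       }
//   };
//   reference_gemm_multiple_d<half_t, half_t, ck_tile::tuple<half_t>, float, half_t, AccAddBias>(
//       a, b, {bias}, c, AccAddBias{});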

template <typename ADataType,
          typename BDataType,
          typename AccDataType,
          typename CDataType,
          typename LayoutA,
          typename LayoutB,
          typename LayoutC>
__global__ void naive_gemm_kernel(ADataType* A,
                                  BDataType* B,
                                  CDataType* C,
                                  ck_tile::index_t M,
                                  ck_tile::index_t N,
                                  ck_tile::index_t K,
                                  ck_tile::index_t strideA,
                                  ck_tile::index_t strideB,
                                  ck_tile::index_t strideC)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int row = idx / N; // Compute row index
    int col = idx % N; // Compute column index

    if(row < M && col < N)
    {
        AccDataType acc = 0.0;
        for(int k = 0; k < K; ++k)
        {
            constexpr index_t packed_size_a = ck_tile::numeric_traits<ADataType>::PackedSize;
            constexpr index_t packed_size_b = ck_tile::numeric_traits<BDataType>::PackedSize;
            // Adjust indexing based on matrix layout
            int a_index = (std::is_same_v<LayoutA, tensor_layout::gemm::RowMajor>)
                              ? row * strideA + k
                              : k * strideA + row;
            int b_index = (std::is_same_v<LayoutB, tensor_layout::gemm::ColumnMajor>)
                              ? col * strideB + k
                              : k * strideB + col;

            AccDataType v_a;
            AccDataType v_b;
            if constexpr(std::is_same_v<ADataType, pk_int4_t>)
            {
                const fp32x2_t fp32_val = pk_int4_t_to_fp32x2_t(A[a_index / packed_size_a]);
                if(k % 2 == 1)
                    v_a = fp32_val.hi;
                else
                    v_a = fp32_val.lo;
            }
            else if constexpr(std::is_same_v<ADataType, pk_fp4_t>)
            {
                const fp32x2_t fp32_val = pk_fp4_to_fp32x2(A[a_index / packed_size_a]);
                if(k % 2 == 1)
                    v_a = fp32_val.hi;
                else
                    v_a = fp32_val.lo;
            }
            else
            {
                v_a = ck_tile::type_convert<AccDataType>(A[a_index]);
            }
            if constexpr(std::is_same_v<BDataType, pk_int4_t>)
            {
                const fp32x2_t fp32_val = pk_int4_t_to_fp32x2_t(B[b_index / packed_size_b]);
                if(k % 2 == 1)
                    v_b = fp32_val.hi;
                else
                    v_b = fp32_val.lo;
            }
            else if constexpr(std::is_same_v<BDataType, pk_fp4_t>)
            {
                const fp32x2_t fp32_val = pk_fp4_to_fp32x2(B[b_index / packed_size_b]);
                if(k % 2 == 1)
                    v_b = fp32_val.hi;
                else
                    v_b = fp32_val.lo;
            }
            else
            {
                v_b = ck_tile::type_convert<AccDataType>(B[b_index]);
            }
            acc += v_a * v_b;
        }

        int c_index = (std::is_same_v<LayoutC, tensor_layout::gemm::RowMajor>)
                          ? row * strideC + col
                          : col * strideC + row;
        C[c_index] = ck_tile::type_convert<CDataType>(acc);
    }
}
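
// One thread computes one C element: row = idx / N, col = idx % N. For packed
// element types the flat index is divided by PackedSize to reach the containing
// storage word; e.g. for pk_int4_t (PackedSize == 2) logical element k of a
// row-major row maps to A[(row * strideA + k) / 2], with even k in the low
// nibble and odd k in the high nibble (hence the .lo/.hi selection above).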

template <typename ADataType,
          typename BDataType,
          typename AccDataType,
          typename CDataType,
          typename LayoutA,
          typename LayoutB,
          typename LayoutC>
__global__ void blockwise_gemm_kernel(ADataType* A,
                                      BDataType* B,
                                      CDataType* C,
                                      ck_tile::index_t M,
                                      ck_tile::index_t N,
                                      ck_tile::index_t K,
                                      ck_tile::index_t strideA,
                                      ck_tile::index_t strideB,
                                      ck_tile::index_t strideC,
                                      ck_tile::index_t scale_granularity_m,
                                      ck_tile::index_t scale_granularity_n,
                                      ck_tile::index_t scale_granularity_k,
                                      float* scale_A_ptr,
                                      float* scale_B_ptr)
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int row = idx / N; // Compute row index
    int col = idx % N; // Compute column index

    if(row < M && col < N)
    {
        AccDataType acc = 0.0, acc_temp = 0.0;

        index_t scale_A_stride = (M + scale_granularity_m - 1) / scale_granularity_m;
        index_t scale_B_stride = (N + scale_granularity_n - 1) / scale_granularity_n;

        float scale_A = 0;
        float scale_B = 0;

        for(int k = 0; k < K; ++k)
        {
            if(k % scale_granularity_k == 0)
            {
                // update acc
                acc += acc_temp * scale_A * scale_B;
                acc_temp = 0.0;
                // update scale factors
                scale_A = scale_A_ptr[(row / scale_granularity_m) +
                                      (k / scale_granularity_k) * scale_A_stride];
                scale_B = scale_B_ptr[(col / scale_granularity_n) +
                                      (k / scale_granularity_k) * scale_B_stride];
            }

            constexpr index_t packed_size_a = ck_tile::numeric_traits<ADataType>::PackedSize;
            constexpr index_t packed_size_b = ck_tile::numeric_traits<BDataType>::PackedSize;
            // Adjust indexing based on matrix layout
            int a_index = (std::is_same_v<LayoutA, tensor_layout::gemm::RowMajor>)
                              ? row * strideA + k
                              : k * strideA + row;
            int b_index = (std::is_same_v<LayoutB, tensor_layout::gemm::ColumnMajor>)
                              ? col * strideB + k
                              : k * strideB + col;

            AccDataType v_a;
            AccDataType v_b;
            if constexpr(std::is_same_v<ADataType, pk_int4_t>)
            {
                const fp32x2_t fp32_val = pk_int4_t_to_fp32x2_t(A[a_index / packed_size_a]);
                if(k % 2 == 1)
                    v_a = fp32_val.hi;
                else
                    v_a = fp32_val.lo;
            }
            else if constexpr(std::is_same_v<ADataType, pk_fp4_t>)
            {
                const fp32x2_t fp32_val = pk_fp4_to_fp32x2(A[a_index / packed_size_a]);
                if(k % 2 == 1)
                    v_a = fp32_val.hi;
                else
                    v_a = fp32_val.lo;
            }
            else
            {
                v_a = ck_tile::type_convert<AccDataType>(A[a_index]);
            }

            if constexpr(std::is_same_v<BDataType, pk_int4_t>)
            {
                const fp32x2_t fp32_val = pk_int4_t_to_fp32x2_t(B[b_index / packed_size_b]);
                if(k % 2 == 1)
                    v_b = fp32_val.hi;
                else
                    v_b = fp32_val.lo;
            }
            else if constexpr(std::is_same_v<BDataType, pk_fp4_t>)
            {
                const fp32x2_t fp32_val = pk_fp4_to_fp32x2(B[b_index / packed_size_b]);
                if(k % 2 == 1)
                    v_b = fp32_val.hi;
                else
                    v_b = fp32_val.lo;
            }
            else
            {
                v_b = ck_tile::type_convert<AccDataType>(B[b_index]);
            }
            acc_temp += v_a * v_b;
        }
        // final accumulation
        acc += acc_temp * scale_A * scale_B;

        int c_index = (std::is_same_v<LayoutC, tensor_layout::gemm::RowMajor>)
                          ? row * strideC + col
                          : col * strideC + row;
        C[c_index] = ck_tile::type_convert<CDataType>(acc);
    }
}
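
// Blockwise scaling decomposes the dot product into K-blocks:
//   C[row, col] = sum_b scale_A[b] * scale_B[b] * sum_{k in block b} A[row, k] * B[k, col]
// acc_temp accumulates within the current scale_granularity_k block; at each
// block boundary it is flushed into acc (scaled by both factors) and the scale
// factors are reloaded. The flush at k == 0 is a no-op (acc_temp is still 0);
// the last block is flushed once more after the loop.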

template <typename ADataType,
          typename BDataType,
          typename AccDataType,
          typename CDataType,
          typename LayoutA,
          typename LayoutB,
          typename LayoutC>
void reference_gemm_gpu(ADataType* a_ptr,
                        BDataType* b_ptr,
                        CDataType* c_ptr,
                        index_t M,
                        index_t N,
                        index_t K,
                        index_t stride_a,
                        index_t stride_b,
                        index_t stride_c)
{
    int totalElements      = M * N;
    int numThreadsPerBlock = 256; // Common choice for threads per block
    int numBlocks          = (totalElements + numThreadsPerBlock - 1) / numThreadsPerBlock;

    naive_gemm_kernel<ADataType, BDataType, AccDataType, CDataType, LayoutA, LayoutB, LayoutC>
        <<<numBlocks, numThreadsPerBlock>>>(
            a_ptr, b_ptr, c_ptr, M, N, K, stride_a, stride_b, stride_c);

    return;
}
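
// Launch sketch (hypothetical device buffers; one thread per C element):
//
//   float *d_a, *d_b, *d_c; // assumed already allocated and filled
//   reference_gemm_gpu<float, float, float, float,
//                      tensor_layout::gemm::RowMajor,
//                      tensor_layout::gemm::ColumnMajor,
//                      tensor_layout::gemm::RowMajor>(d_a, d_b, d_c, M, N, K, K, K, N);
//
// The launch is asynchronous; synchronize (e.g. hipDeviceSynchronize()) before
// reading d_c on the host.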

template <typename ADataType,
          typename BDataType,
          typename AccDataType,
          typename CDataType,
          typename LayoutA,
          typename LayoutB,
          typename LayoutC>
void reference_blockwise_gemm_gpu(ADataType* a_ptr,
                                  BDataType* b_ptr,
                                  CDataType* c_ptr,
                                  index_t M,
                                  index_t N,
                                  index_t K,
                                  index_t stride_a,
                                  index_t stride_b,
                                  index_t stride_c,
                                  index_t scale_granularity_m,
                                  index_t scale_granularity_n,
                                  index_t scale_granularity_k,
                                  float* scale_A_ptr,
                                  float* scale_B_ptr)
{
    int totalElements      = M * N;
    int numThreadsPerBlock = 256; // Common choice for threads per block
    int numBlocks          = (totalElements + numThreadsPerBlock - 1) / numThreadsPerBlock;

    blockwise_gemm_kernel<ADataType, BDataType, AccDataType, CDataType, LayoutA, LayoutB, LayoutC>
        <<<numBlocks, numThreadsPerBlock>>>(a_ptr,
                                            b_ptr,
                                            c_ptr,
                                            M,
                                            N,
                                            K,
                                            stride_a,
                                            stride_b,
                                            stride_c,
                                            scale_granularity_m,
                                            scale_granularity_n,
                                            scale_granularity_k,
                                            scale_A_ptr,
                                            scale_B_ptr);

    return;
}
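
// Granularity sketch: with M = N = 256, K = 512 and all three granularities set
// to 128, scale_A holds (K/128) = 4 k-blocks of ceil(M/128) = 2 entries each
// (8 floats total, k-block-major as indexed in the kernel), and scale_B likewise.
// Passing scale_granularity_* = 1 emulates per-element scales.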

template <typename ADataType,
          typename BDataType,
          typename AccDataType,
          typename CDataType,
          typename LayoutA,
          typename LayoutB,
          typename LayoutC>
void reference_batched_gemm_gpu(ADataType* a_ptr,
                                BDataType* b_ptr,
                                CDataType* c_ptr,
                                index_t M,
                                index_t N,
                                index_t K,
                                index_t stride_a,
                                index_t stride_b,
                                index_t stride_c,
                                index_t batch_stride_A,
                                index_t batch_stride_B,
                                index_t batch_stride_C,
                                index_t batch_count)
{
    int totalElements      = M * N;
    int numThreadsPerBlock = 256; // Common choice for threads per block
    int numBlocks          = (totalElements + numThreadsPerBlock - 1) / numThreadsPerBlock;

    for(index_t batch_id = 0; batch_id < batch_count; ++batch_id)
    {
        ADataType* d_ATemp = a_ptr + batch_id * batch_stride_A;
        BDataType* d_BTemp = b_ptr + batch_id * batch_stride_B;
        CDataType* d_CTemp = c_ptr + batch_id * batch_stride_C;
        naive_gemm_kernel<ADataType, BDataType, AccDataType, CDataType, LayoutA, LayoutB, LayoutC>
            <<<numBlocks, numThreadsPerBlock>>>(
                d_ATemp, d_BTemp, d_CTemp, M, N, K, stride_a, stride_b, stride_c);
    }

    return;
}
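
// Batched variant: each batch reuses the naive kernel with pointers offset by
// batch_id * batch_stride_{A,B,C}. For densely packed batches a typical choice
// (an assumption, not enforced here) is batch_stride_A = M * K,
// batch_stride_B = K * N, batch_stride_C = M * N. The kernels are enqueued on
// the default stream, so they execute in order.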

} // namespace ck_tile