mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-04-19 22:39:03 +00:00
[CK_TILE] Add mxfp4 flatmm (#3080)
* Squashed commit of the following: commit 3e1a851dad834776efbe4fe365ac82c4ed312010 Author: Ding, Yi <yi.ding@amd.com> Date: Thu Oct 23 06:10:54 2025 +0000 Fix & clean after rebase commit 1edf485092f44411da9a1796a4a6b72d5cdb67c6 Author: Ding, Yi <yi.ding@amd.com> Date: Wed Oct 22 10:46:13 2025 +0000 Squashed commit of the following: commit0b6b9dbd1bAuthor: mtgu0705 <mtgu@amd.com> Date: Mon Sep 22 02:04:27 2025 -0500 fix bandwidth calculation commit9aebf53bb7Author: mtgu0705 <mtgu@amd.com> Date: Mon Sep 22 00:58:59 2025 -0500 updates commit62607de56cAuthor: mtgu0705 <mtgu@amd.com> Date: Fri Sep 19 00:39:46 2025 -0500 fix a bug, set the A DS_read preload size to 4 for MXFP4 commit92ad6fcc0aAuthor: mtgu0705 <mtgu@amd.com> Date: Thu Sep 18 01:19:03 2025 -0500 fix a_wrap preload issue for large MPerBlock. commitf2db44710fAuthor: mtgu0705 <mtgu@amd.com> Date: Wed Sep 17 21:34:03 2025 -0500 optimized the VGPR repack issue for MXFP4 commit346a400027Author: Gino Lu <gino.lu@amd.com> Date: Wed Sep 17 04:19:44 2025 -0500 fix time error commit80c1743034Author: mtgu0705 <mtgu@amd.com> Date: Wed Sep 17 03:58:00 2025 -0500 updated, function passed. commitce26d9071eAuthor: mtgu0705 <mtgu@amd.com> Date: Tue Sep 16 22:21:39 2025 -0500 fix, function partially passed commit0a89ed13a5Author: mtgu0705 <mtgu@amd.com> Date: Tue Sep 16 03:01:12 2025 -0500 fix, reference function passed, next check kernel function commitec9bcef591Author: Gino Lu <gino.lu@amd.com> Date: Tue Sep 16 02:29:01 2025 -0500 let pack/unpack return pk_fp4_t commita333206929Author: mtgu0705 <mtgu@amd.com> Date: Mon Sep 15 20:50:26 2025 -0500 fix commit3893c06540Author: Gino Lu <gino.lu@amd.com> Date: Mon Sep 15 05:51:06 2025 -0500 fix bug commit8052bea019Author: mtgu0705 <mtgu@amd.com> Date: Mon Sep 15 04:02:05 2025 -0500 fix core dump issue, function is not correct. 
commit9ceb3fd508Author: mtgu0705 <mtgu@amd.com> Date: Mon Sep 15 03:03:02 2025 -0500 updates, build pass commitcc94eb6045Author: mtgu0705 <mtgu@amd.com> Date: Mon Sep 15 00:05:18 2025 -0500 updates commit22586c3135Author: Gino Lu <gino.lu@amd.com> Date: Sun Sep 14 23:40:28 2025 -0500 fix bug commite92e67b8ddAuthor: Gino Lu <gino.lu@amd.com> Date: Fri Sep 12 03:28:50 2025 -0500 fix interface commit8b1dd60c08Author: Gino Lu <gino.lu@amd.com> Date: Fri Sep 12 02:53:50 2025 -0500 add interface in warp_gemm_impl commitc6135f6abeAuthor: mtgu0705 <mtgu@amd.com> Date: Wed Sep 10 05:03:08 2025 -0500 updates some fixes. commitb0d71b8d19Author: mtgu0705 <mtgu@amd.com> Date: Tue Sep 9 04:37:42 2025 -0500 fix after merge ginolu/add_wgmfma_dispatcher commitf119c30317Merge:c5030e60272c8ef856Author: mtgu0705 <mtgu@amd.com> Date: Mon Sep 8 22:09:15 2025 -0500 Merge remote-tracking branch 'origin/ginolu/add_wgmfma_dispatcher' into mtgu/cktile_mxfp4_flatmm_dev commitc5030e602eAuthor: mtgu0705 <mtgu@amd.com> Date: Mon Sep 8 21:42:47 2025 -0500 update mx flatmm tail pipeline commit72c8ef8567Merge:9661bb400e4a772890Author: Gino Lu <gino.lu@amd.com> Date: Mon Sep 8 19:10:23 2025 -0500 Merge branch 'develop' into ginolu/add_wgmfma_dispatcher commit9661bb400bAuthor: Gino Lu <gino.lu@amd.com> Date: Mon Sep 8 19:09:55 2025 -0500 fix type error commit0509597f55Author: mtgu0705 <mtgu@amd.com> Date: Mon Sep 8 04:01:40 2025 -0500 update hotloop pipeline commit754ae0461bMerge:15d44406e83f607e2aAuthor: Gino Lu <gino.lu@amd.com> Date: Fri Sep 5 04:22:26 2025 -0500 Merge branch 'develop' into ginolu/add_wgmfma_dispatcher commit15d44406e5Author: Gino Lu <gino.lu@amd.com> Date: Fri Sep 5 04:21:26 2025 -0500 fix clang format commit146963d62aAuthor: mtgu0705 <mtgu@amd.com> Date: Wed Sep 3 10:00:54 2025 -0500 some updates commit12526b626aMerge:47cee047100fd72b2dAuthor: asleepzzz <hanwen.chang@amd.com> Date: Wed Sep 3 13:22:03 2025 +0800 Merge branch 'develop' into ginolu/add_wgmfma_dispatcher 
commit47cee04712Author: Gino Lu <gino.lu@amd.com> Date: Mon Sep 1 02:11:02 2025 -0500 fix vec size error commitd2892925e5Author: Gino Lu <gino.lu@amd.com> Date: Mon Sep 1 01:23:39 2025 -0500 fix format error commit16993acd1dAuthor: mtgu0705 <mtgu@amd.com> Date: Sat Aug 30 03:19:07 2025 -0500 update codes commit9c37e55d13Author: mtgu0705 <mtgu@amd.com> Date: Fri Aug 29 11:27:33 2025 -0500 init ck_tile mxfp4 flatmm commit5c484a5672Author: Feng Shijie <Shijie.Feng@amd.com> Date: Thu Aug 28 08:02:50 2025 +0000 Add bias for f16xf4 moe_flatmm commitdd6539f366Author: Feng Shijie <Shijie.Feng@amd.com> Date: Wed Aug 27 13:39:47 2025 +0000 update case construction commit65b702454cAuthor: Feng Shijie <Shijie.Feng@amd.com> Date: Tue Aug 26 12:32:29 2025 +0000 support swiglu activaion and use rcpf to accelerate silu commitb422e41e08Author: Gino Lu <gino.lu@amd.com> Date: Tue Aug 26 02:33:55 2025 -0500 first commit commitd05eed931dAuthor: root <root@smci355-ccs-aus-m02-25.cs-aus.dcgpu> Date: Fri Aug 22 04:01:59 2025 -0500 add line to last commitd69cab7f0cAuthor: root <root@smci355-ccs-aus-m02-25.cs-aus.dcgpu> Date: Fri Aug 22 03:20:46 2025 -0500 adjust A_LDS descriptor to avoid bankconflict commit65989e940cAuthor: root <root@smci355-ccs-aus-m02-25.cs-aus.dcgpu> Date: Thu Aug 21 09:46:52 2025 -0500 enable hotloop commitc378e9bdf8Author: Feng Shijie <Shijie.Feng@amd.com> Date: Thu Aug 21 09:12:21 2025 +0000 support atomic_pk_add_bf16 on gfx950 commit85976b0b87Author: Feng Shijie <Shijie.Feng@amd.com> Date: Thu Aug 21 06:58:55 2025 +0000 use int64_t as expert stride to avoid overflow commit9fbcc8f8a4Author: Feng Shijie <Shijie.Feng@amd.com> Date: Wed Aug 20 13:53:32 2025 +0000 use v4i32 as the storage type for B to avoid repack operation commit81899bd920Author: Feng Shijie <Shijie.Feng@amd.com> Date: Wed Aug 20 06:40:03 2025 +0000 add pk_fp4_t and e8m0_t support for amd_buffer_load_impl commitc27eb0771aAuthor: Feng Shijie <Shijie.Feng@amd.com> Date: Wed Aug 20 04:39:14 2025 +0000 
optimize cvt_pkf4_to_f16 implementation commit3ca0bd500aAuthor: Feng Shijie <Shijie.Feng@amd.com> Date: Tue Aug 19 14:56:46 2025 +0000 optimize A_LDS descriptor to avoid bankconflict commitf7f0306eeaAuthor: Feng Shijie <Shijie.Feng@amd.com> Date: Mon Aug 18 18:43:37 2025 +0000 fix gate-up when GU_NRepeat > 1 commitbe55c0f9cbAuthor: Feng Shijie <Shijie.Feng@amd.com> Date: Mon Aug 18 17:28:11 2025 +0000 add fp16xf4 moe commit599e1f5b32Author: Feng Shijie <Shijie.Feng@amd.com> Date: Sun Aug 17 17:51:18 2025 +0000 rename example commit7899fb4a8dAuthor: Feng Shijie <Shijie.Feng@amd.com> Date: Fri Aug 15 06:20:46 2025 +0000 remove additional check when e8m0->float commit714b341797Author: Feng Shijie <Shijie.Feng@amd.com> Date: Thu Aug 14 09:34:12 2025 +0000 eliminate repeat dequant commit53e8c0c533Merge:5de620895cc9c7b9e5Author: Feng Shijie <Shijie.Feng@amd.com> Date: Wed Aug 13 16:51:49 2025 +0000 Merge remote-tracking branch 'origin/moe_flatmm' into feat-mixed_input_flatmm commit5de6208952Author: Feng Shijie <Shijie.Feng@amd.com> Date: Wed Aug 13 16:16:48 2025 +0000 update f16xMXF4 commit732ebdee8bAuthor: Feng Shijie <Shijie.Feng@amd.com> Date: Wed Aug 13 10:48:53 2025 +0000 update scale-preshuffle for MXF4 commitedb58d0680Author: Feng Shijie <Shijie.Feng@amd.com> Date: Mon Aug 11 11:24:34 2025 +0000 update commitcc9c7b9e58Author: Feng Shijie <Shijie.Feng@amd.com> Date: Mon Aug 11 08:38:23 2025 +0000 optimize gemm2 atomic_add pattern commit200a11afc8Author: Feng Shijie <Shijie.Feng@amd.com> Date: Mon Aug 11 07:59:47 2025 +0000 update scale for mxfp4 commit87aed564dcAuthor: Feng Shijie <Shijie.Feng@amd.com> Date: Mon Aug 11 07:56:14 2025 +0000 update case construction commit8b85fa6cf2Author: Feng Shijie <Shijie.Feng@amd.com> Date: Mon Aug 11 06:03:06 2025 +0000 update granularity control commit1b8c7097b8Author: Feng Shijie <Shijie.Feng@amd.com> Date: Mon Aug 11 03:42:46 2025 +0000 fix TileConfig commit8ba1c708dcAuthor: Gino Lu <gino.lu@amd.com> Date: Thu Aug 7 21:37:28 
2025 +0800 Add e8m0 scaled convert into CK_TILE (#2617) * first commit * remove redundent code * modify according to comments. * fix type_convert error with scaled_type_convert commitf788d3d629Author: Feng Shijie <Shijie.Feng@amd.com> Date: Fri Aug 8 20:19:16 2025 +0000 add mixed_prec fp16xfp4 commit3dea10a277Author: Feng Shijie <Shijie.Feng@amd.com> Date: Thu Aug 7 09:22:04 2025 +0000 debug mixed_prec flatmm commit0ba513b148Merge:90e910f3ac0cb4d036Author: lalala-sh <Jiaxing.Wen@amd.com> Date: Wed Aug 6 16:49:47 2025 +0800 Merge pull request #2626 from ROCm/felix/flatmm_fix_splitk fix split k commit6d3cbc7c0eAuthor: Feng Shijie <Shijie.Feng@amd.com> Date: Wed Aug 6 08:33:33 2025 +0000 add moe_flatmm commitc0cb4d036dAuthor: coderfeli <coderfeli@163.com> Date: Wed Aug 6 02:45:31 2025 +0000 fix split k commit90e910f3a7Author: Feng Shijie <Shijie.Feng@amd.com> Date: Mon Aug 4 07:16:36 2025 +0000 fix flatmm with scaling when WarpTileM == 32 commitaa5e008fa5Author: Feng Shijie <Shijie.Feng@amd.com> Date: Fri Aug 1 11:01:23 2025 +0000 optimize scaling epilogue commitac5908c0bbAuthor: Feng Shijie <Shijie.Feng@amd.com> Date: Fri Aug 1 07:28:38 2025 +0000 fix wrong config for fp8 scaling commit3f43b841d4Author: Feng Shijie <Shijie.Feng@amd.com> Date: Wed Jul 30 06:20:30 2025 +0000 prune debug message commit2e5d4c74cdAuthor: Feng Shijie <Shijie.Feng@amd.com> Date: Wed Jul 30 04:52:08 2025 +0000 fix compile error commitc117a1986aAuthor: Feng Shijie <Shijie.Feng@amd.com> Date: Tue Jul 29 15:42:58 2025 +0000 Add persistent option on flatmm for tuning commita587701117Author: AMD-dteng <dteng@amd.com> Date: Tue Jul 29 22:48:00 2025 +0800 update pipeline v1: add atomic IGLP schedule commitf9e48148d2Author: lalala-sh <Jiaxing.Wen@amd.com> Date: Thu Jul 24 09:09:27 2025 +0000 fix error log throwing commit1b6d7cf407Author: Feng Shijie <Shijie.Feng@amd.com> Date: Mon Jul 28 08:24:51 2025 +0000 crz idea commit5473f06461Author: Feng Shijie <Shijie.Feng@amd.com> Date: Sun Jul 27 11:57:38 
2025 +0000 Add permuteN optimzization when NRepeat % 2 == 0 on flatmm commitbfb9f4002fAuthor: sjfeng <j514681085@icloud.com> Date: Sun Jul 27 17:24:08 2025 +0800 try to remove c_shuffle_lds commit1264f4d2abAuthor: Feng Shijie <Shijie.Feng@amd.com> Date: Fri Jul 25 07:41:48 2025 +0000 fix loop-dim mismatch and improve c_shuffle alu parallelism commit1239d8a546Merge:406645448b908f5e80Author: lalala-sh <Jiaxing.Wen@amd.com> Date: Thu Jul 24 08:46:51 2025 +0000 merge flatmm -scale commit4066454483Author: lalala-sh <Jiaxing.Wen@amd.com> Date: Thu Jul 24 16:19:58 2025 +0800 revert delete of inc file commit68390988c9Author: solin <bingzhou@amd.com> Date: Thu Jul 24 04:38:16 2025 +0000 reorg flatmm code commitb908f5e803Author: Feng Shijie <Shijie.Feng@amd.com> Date: Wed Jul 23 19:12:31 2025 +0000 fix flatmm syntax error on gfx950 commit5a1183ebbdAuthor: Feng Shijie <Shijie.Feng@amd.com> Date: Wed Jul 23 19:04:22 2025 +0000 support flatmm scaling commit89fa639207Author: valarLip <340077269@qq.com> Date: Wed Jul 23 08:44:12 2025 +0000 merge flatmm pipe v0 from dteng_flatmm_opt commit3f7d848dd3Author: lalala-sh <Jiaxing.Wen@amd.com> Date: Wed Jul 23 15:38:12 2025 +0800 build pass commit6dacf833daAuthor: lalala-sh <Jiaxing.Wen@amd.com> Date: Wed Jul 23 07:20:26 2025 +0000 fix bug commit7e1bd4b839Author: lalala-sh <Jiaxing.Wen@amd.com> Date: Wed Jul 23 15:01:53 2025 +0800 sync commit46a538e39eAuthor: valarLip <340077269@qq.com> Date: Tue Jul 22 08:09:35 2025 +0000 adaptive scheduler instead of Macro definition commit9aa3396a79Author: lalala-sh <Jiaxing.Wen@amd.com> Date: Thu Jul 17 08:40:35 2025 +0000 fix tail handler bug commitfb76450e63Author: lalala-sh <Jiaxing.Wen@amd.com> Date: Wed Jul 16 10:12:19 2025 +0000 merge from dteng_flatmm_opt --------- Co-authored-by: lalala-sh <Jiaxing.Wen@amd.com> Co-authored-by: AMD-dteng <dteng@amd.com> Co-authored-by: solin <bingzhou@amd.com> Co-authored-by: sjfeng <j514681085@icloud.com> Co-authored-by: valarLip <340077269@qq.com> 
Co-authored-by: asleepzzz <hanwen.chang@amd.com> Co-authored-by: Feng Shijie <Shijie.Feng@amd.com> Co-authored-by: coderfeli <coderfeli@163.com> Co-authored-by: Gino Lu <gino.lu@amd.com> Co-authored-by: mtgu0705 <mtgu@amd.com> * Fix crash on small M * Apply suggestion from @Copilot --------- Co-authored-by: lalala-sh <Jiaxing.Wen@amd.com> Co-authored-by: AMD-dteng <dteng@amd.com> Co-authored-by: solin <bingzhou@amd.com> Co-authored-by: sjfeng <j514681085@icloud.com> Co-authored-by: valarLip <340077269@qq.com> Co-authored-by: asleepzzz <hanwen.chang@amd.com> Co-authored-by: Feng Shijie <Shijie.Feng@amd.com> Co-authored-by: coderfeli <coderfeli@163.com> Co-authored-by: Gino Lu <gino.lu@amd.com> Co-authored-by: mtgu0705 <mtgu@amd.com>
This commit is contained in:
@@ -14,6 +14,7 @@ if(has_supported_gpu)
|
||||
add_executable(tile_example_moe_flatmm EXCLUDE_FROM_ALL moe_flatmm.cpp)
|
||||
add_executable(tile_example_a16w4_moe_flatmm EXCLUDE_FROM_ALL mixed_prec/a16w4_moe_flatmm.cpp)
|
||||
add_executable(tile_example_grouped_flatmm EXCLUDE_FROM_ALL grouped_flatmm.cpp)
|
||||
add_executable(tile_example_mx_flatmm EXCLUDE_FROM_ALL mxgemm/mx_flatmm.cpp) # TODO: 950 only
|
||||
|
||||
set(EXAMPLE_FLATMM_COMPILE_OPTIONS)
|
||||
set(EXAMPLE_MOE_FLATMM_COMPILE_OPTIONS)
|
||||
@@ -27,6 +28,6 @@ if(has_supported_gpu)
|
||||
target_compile_options(tile_example_moe_flatmm PRIVATE ${EXAMPLE_FLATMM_COMPILE_OPTIONS})
|
||||
target_compile_options(tile_example_a16w4_moe_flatmm PRIVATE ${EXAMPLE_FLATMM_COMPILE_OPTIONS})
|
||||
target_compile_options(tile_example_grouped_flatmm PRIVATE ${EXAMPLE_FLATMM_COMPILE_OPTIONS})
|
||||
|
||||
target_compile_options(tile_example_mx_flatmm PRIVATE ${EXAMPLE_FLATMM_COMPILE_OPTIONS}) # TODO: 950 only
|
||||
endif()
|
||||
|
||||
|
||||
506
example/ck_tile/18_flatmm/mxgemm/mx_flatmm.cpp
Normal file
506
example/ck_tile/18_flatmm/mxgemm/mx_flatmm.cpp
Normal file
@@ -0,0 +1,506 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <hip/hip_runtime.h>

#include <cstddef>
#include <cstdlib>
#include <cstring>
#include <exception>
#include <functional>
#include <iostream>
#include <memory>
#include <ostream>
#include <stdexcept>
#include <string>
#include <tuple>
#include <type_traits>

#include "ck_tile/host.hpp"
#include "mx_flatmm.hpp"
|
||||
|
||||
template <typename Layout>
|
||||
static constexpr inline auto is_row_major(Layout layout_)
|
||||
{
|
||||
return ck_tile::bool_constant<std::is_same_v<ck_tile::remove_cvref_t<decltype(layout_)>,
|
||||
ck_tile::tensor_layout::gemm::RowMajor>>{};
|
||||
}
|
||||
|
||||
template <typename FlatmmConfig,
|
||||
typename ADataType,
|
||||
typename BDataType,
|
||||
typename DsDatatype,
|
||||
typename AccDataType,
|
||||
typename CDataType,
|
||||
typename ALayout,
|
||||
typename BLayout,
|
||||
typename DsLayout,
|
||||
typename ELayout,
|
||||
typename ScaleM,
|
||||
typename ScaleN,
|
||||
bool persistent,
|
||||
typename CDEElementWise>
|
||||
float mx_flatmm_calc(const ck_tile::ScaleFlatmmHostArgs<ScaleM, ScaleN>& args,
|
||||
const ck_tile::stream_config& s)
|
||||
{
|
||||
using CodegenFlatmmShape = ck_tile::TileGemmShape<
|
||||
ck_tile::sequence<FlatmmConfig::M_Tile, FlatmmConfig::N_Tile, FlatmmConfig::K_Tile>,
|
||||
ck_tile::sequence<FlatmmConfig::M_Warp, FlatmmConfig::N_Warp, FlatmmConfig::K_Warp>,
|
||||
ck_tile::sequence<FlatmmConfig::M_Warp_Tile,
|
||||
FlatmmConfig::N_Warp_Tile,
|
||||
FlatmmConfig::K_Warp_Tile>>;
|
||||
|
||||
using TilePartitioner =
|
||||
ck_tile::GemmSpatiallyLocalTilePartitioner<CodegenFlatmmShape,
|
||||
FlatmmConfig::TileParitionerGroupNum,
|
||||
FlatmmConfig::TileParitionerM01>;
|
||||
|
||||
using Traits = ck_tile::TileGemmTraits<FlatmmConfig::kPadM,
|
||||
FlatmmConfig::kPadN,
|
||||
FlatmmConfig::kPadK,
|
||||
ALayout,
|
||||
BLayout,
|
||||
ELayout,
|
||||
FlatmmConfig::NumWaveGroups>;
|
||||
|
||||
using CodegenGemmTraits = ck_tile::TileGemmUniversalTraits<FlatmmConfig::kPadM,
|
||||
FlatmmConfig::kPadN,
|
||||
FlatmmConfig::kPadK,
|
||||
FlatmmConfig::DoubleSmemBuffer,
|
||||
ALayout,
|
||||
BLayout,
|
||||
ELayout,
|
||||
FlatmmConfig::TransposeC,
|
||||
FlatmmConfig::UseStructuredSparsity,
|
||||
persistent,
|
||||
FlatmmConfig::NumWaveGroups,
|
||||
true>;
|
||||
|
||||
using ComputeDataType = ADataType;
|
||||
static_assert(sizeof(ComputeDataType) >= sizeof(BDataType),
|
||||
"mixed_prec_flatmm requires ADataType is a wider type than BDataType");
|
||||
|
||||
using GemmPipelineProblem = ck_tile::GemmPipelineProblem<ComputeDataType,
|
||||
ComputeDataType,
|
||||
AccDataType,
|
||||
CodegenFlatmmShape,
|
||||
Traits>;
|
||||
|
||||
using BaseGemmPipeline = ck_tile::BaseFlatmmPipelineAGmemBGmemCRegV1<GemmPipelineProblem>;
|
||||
|
||||
const ck_tile::index_t k_grain = args.k_batch * FlatmmConfig::K_Tile;
|
||||
const ck_tile::index_t K_split = (args.K + k_grain - 1) / k_grain * FlatmmConfig::K_Tile;
|
||||
const ck_tile::index_t num_loop = TilePartitioner::GetLoopNum(K_split);
|
||||
const bool has_hot_loop = BaseGemmPipeline::BlockHasHotloop(num_loop);
|
||||
const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop);
|
||||
float ave_time{0};
|
||||
|
||||
const auto Run = [&](const auto has_hot_loop_,
|
||||
const auto tail_number_,
|
||||
const auto memory_operation_) {
|
||||
constexpr bool has_hot_loop_v = has_hot_loop_.value;
|
||||
constexpr auto tail_number_v = tail_number_.value;
|
||||
constexpr auto scheduler = FlatmmConfig::Scheduler;
|
||||
constexpr auto memory_operation = memory_operation_.value;
|
||||
|
||||
constexpr int BlockedXDLN_PerWarp = 2; // determined by scale shuffle pattern
|
||||
|
||||
using CodegenPipelineProblem = ck_tile::MXFlatmmPipelineProblem<ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CodegenFlatmmShape,
|
||||
CodegenGemmTraits,
|
||||
scheduler,
|
||||
has_hot_loop_v,
|
||||
tail_number_v>;
|
||||
|
||||
using CodegenMXFlatmmPipeline =
|
||||
ck_tile::MXF4FlatmmPipelineAGmemBGmemCRegV1<CodegenPipelineProblem>;
|
||||
|
||||
using GemmEpilogue = ck_tile::CShuffleEpilogue<
|
||||
ck_tile::CShuffleEpilogueProblem<ComputeDataType,
|
||||
ComputeDataType,
|
||||
DsDatatype,
|
||||
AccDataType,
|
||||
CDataType,
|
||||
DsLayout,
|
||||
ELayout,
|
||||
CDEElementWise,
|
||||
TilePartitioner::MPerBlock,
|
||||
TilePartitioner::NPerBlock,
|
||||
FlatmmConfig::M_Warp,
|
||||
FlatmmConfig::N_Warp,
|
||||
FlatmmConfig::M_Warp_Tile,
|
||||
FlatmmConfig::N_Warp_Tile,
|
||||
FlatmmConfig::K_Warp_Tile,
|
||||
CodegenPipelineProblem::TransposeC,
|
||||
memory_operation,
|
||||
FlatmmConfig::NumWaveGroups,
|
||||
false, // FixedVectorSize
|
||||
1, // VectorSizeC
|
||||
FlatmmConfig::TiledMMAPermuteN,
|
||||
BlockedXDLN_PerWarp>>;
|
||||
|
||||
using Kernel =
|
||||
ck_tile::MXFlatmmKernel<TilePartitioner, CodegenMXFlatmmPipeline, GemmEpilogue>;
|
||||
|
||||
auto kargs = Kernel::MakeKernelArgs(args);
|
||||
|
||||
const dim3 grids = Kernel::GridSize(kargs);
|
||||
constexpr dim3 blocks = Kernel::BlockSize();
|
||||
|
||||
if(!Kernel::IsSupportedArgument(kargs))
|
||||
{
|
||||
throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!\n");
|
||||
}
|
||||
|
||||
if(s.log_level_ > 0)
|
||||
{
|
||||
std::cout << "Launching kernel with args:" << CodegenFlatmmShape::GetName() << "\n"
|
||||
<< "Shape: " << CodegenFlatmmShape::GetName() << "\n"
|
||||
<< "problem: " << CodegenPipelineProblem::GetName() << "\n"
|
||||
<< "pipeline: " << CodegenMXFlatmmPipeline::GetName() << "\n"
|
||||
<< "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
|
||||
<< ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}"
|
||||
<< std::endl;
|
||||
}
|
||||
|
||||
// Declare rotating_mem_ptr here so it stays in scope until it is needed
|
||||
std::unique_ptr<ck_tile::RotatingMemWrapper<ADataType, BDataType>> rotating_mem_ptr;
|
||||
std::function<void()> preprocess;
|
||||
|
||||
auto clear_gemm_output = [&]() {
|
||||
if(args.k_batch > 1)
|
||||
hipGetErrorString(hipMemsetAsync(
|
||||
args.e_ptr, 0, args.M * args.N * sizeof(CDataType), s.stream_id_));
|
||||
};
|
||||
|
||||
if(s.flush_cache_)
|
||||
{
|
||||
std::cout << "Flushing cache..." << std::endl;
|
||||
constexpr ck_tile::index_t APackedSize = ck_tile::numeric_traits<ADataType>::PackedSize;
|
||||
constexpr ck_tile::index_t BPackedSize = ck_tile::numeric_traits<BDataType>::PackedSize;
|
||||
|
||||
ck_tile::HostTensor<ADataType> a_m(ck_tile::host_tensor_descriptor(
|
||||
args.M, args.K, args.stride_A, is_row_major(ALayout{})));
|
||||
ck_tile::HostTensor<BDataType> b_n(ck_tile::host_tensor_descriptor(
|
||||
args.K, args.N, args.stride_B, is_row_major(BLayout{})));
|
||||
|
||||
auto size_a_buffer = a_m.get_element_space_size_in_bytes() / APackedSize;
|
||||
auto size_b_buffer = b_n.get_element_space_size_in_bytes() / BPackedSize;
|
||||
|
||||
rotating_mem_ptr = std::make_unique<ck_tile::RotatingMemWrapper<ADataType, BDataType>>(
|
||||
kargs.a_ptr, kargs.b_ptr, s.rotating_count_, size_a_buffer, size_b_buffer);
|
||||
rotating_mem_ptr->Print();
|
||||
|
||||
preprocess = [&]() {
|
||||
ck_tile::flush_icache();
|
||||
rotating_mem_ptr->Next();
|
||||
clear_gemm_output();
|
||||
};
|
||||
}
|
||||
else
|
||||
{
|
||||
preprocess = clear_gemm_output;
|
||||
}
|
||||
|
||||
ave_time = ck_tile::launch_kernel_time_mask(
|
||||
s,
|
||||
preprocess,
|
||||
ck_tile::make_kernel<FlatmmConfig::kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
|
||||
return ave_time;
|
||||
};
|
||||
|
||||
const auto RunSplitk = [&](const auto has_hot_loop_, const auto tail_number_) {
|
||||
if(args.k_batch == 1)
|
||||
{
|
||||
Run(has_hot_loop_,
|
||||
tail_number_,
|
||||
ck_tile::integral_constant<ck_tile::memory_operation_enum,
|
||||
ck_tile::memory_operation_enum::set>{});
|
||||
}
|
||||
else
|
||||
{
|
||||
Run(has_hot_loop_,
|
||||
tail_number_,
|
||||
ck_tile::integral_constant<ck_tile::memory_operation_enum,
|
||||
ck_tile::memory_operation_enum::atomic_add>{});
|
||||
}
|
||||
};
|
||||
BaseGemmPipeline::TailHandler(RunSplitk, has_hot_loop, tail_num);
|
||||
return ave_time;
|
||||
}
|
||||
|
||||
template <typename FlatmmConfig,
|
||||
typename ADataType,
|
||||
typename BDataType,
|
||||
typename DsDatatype,
|
||||
typename AccDataType,
|
||||
typename CDataType,
|
||||
typename ALayout,
|
||||
typename BLayout,
|
||||
typename DsLayout,
|
||||
typename CLayout,
|
||||
typename ScaleA,
|
||||
typename ScaleB,
|
||||
bool UsePersistentKernel = false,
|
||||
typename CDEElementWise = ck_tile::element_wise::PassThrough>
|
||||
float invoke_mx_flatmm(ck_tile::DeviceMem& a_dev_buf,
|
||||
ck_tile::DeviceMem& b_shuffle_dev_buf,
|
||||
ck_tile::DeviceMem& c_dev_buf,
|
||||
ck_tile::index_t M,
|
||||
ck_tile::index_t N,
|
||||
ck_tile::index_t K,
|
||||
ck_tile::index_t stride_A,
|
||||
ck_tile::index_t stride_B,
|
||||
ck_tile::index_t stride_C,
|
||||
ck_tile::index_t kbatch,
|
||||
ScaleA scale_a,
|
||||
ScaleB scale_b,
|
||||
int n_warmup,
|
||||
int n_repeat)
|
||||
{
|
||||
ck_tile::ScaleFlatmmHostArgs<ScaleA, ScaleB> args = {a_dev_buf.GetDeviceBuffer(),
|
||||
b_shuffle_dev_buf.GetDeviceBuffer(),
|
||||
{},
|
||||
c_dev_buf.GetDeviceBuffer(),
|
||||
kbatch,
|
||||
M,
|
||||
N,
|
||||
K,
|
||||
stride_A,
|
||||
stride_B,
|
||||
{},
|
||||
stride_C,
|
||||
scale_a,
|
||||
scale_b};
|
||||
|
||||
float ave_time = mx_flatmm_calc<FlatmmConfig,
|
||||
ADataType,
|
||||
BDataType,
|
||||
DsDatatype,
|
||||
AccDataType,
|
||||
CDataType,
|
||||
ALayout,
|
||||
BLayout,
|
||||
DsLayout,
|
||||
CLayout,
|
||||
ScaleA,
|
||||
ScaleB,
|
||||
UsePersistentKernel,
|
||||
CDEElementWise>(
|
||||
args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat, true, true, 50});
|
||||
|
||||
constexpr int APackedSize = ck_tile::numeric_traits<ADataType>::PackedSize;
|
||||
constexpr int BPackedSize = ck_tile::numeric_traits<BDataType>::PackedSize;
|
||||
|
||||
std::size_t flop = std::size_t(2) * M * N * K + std::size_t(2) * M * N * K / 32;
|
||||
std::size_t num_byte = sizeof(ADataType) * M * K / APackedSize +
|
||||
sizeof(BDataType) * N * K / BPackedSize + sizeof(CDataType) * M * N +
|
||||
sizeof(ck_tile::e8m0_t) * M * K / 32 +
|
||||
sizeof(ck_tile::e8m0_t) * N * K / 32;
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
|
||||
float gb_per_sec = num_byte / 1.E6 / ave_time;
|
||||
|
||||
std::cout << "Run MXFP4_Flatmm kernel " //
|
||||
<< " M =" << M << " N =" << N << " K =" << K << " StrideA =" << stride_A
|
||||
<< " StrideB =" << stride_B << " StrideC =" << stride_C << " : " << ave_time
|
||||
<< " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " << std::endl;
|
||||
|
||||
return ave_time;
|
||||
}
|
||||
|
||||
auto create_args(int argc, char* argv[])
|
||||
{
|
||||
ck_tile::ArgParser arg_parser;
|
||||
arg_parser.insert("m", "32", "m dimension")
|
||||
.insert("n", "128", "n dimension")
|
||||
.insert("k", "256", "k dimension")
|
||||
.insert("a_layout", "R", "A tensor data layout - Row by default")
|
||||
.insert("b_layout", "C", "B tensor data layout - Row by default")
|
||||
.insert("c_layout", "R", "C tensor data layout - Row by default")
|
||||
.insert("stride_a", "0", "Tensor A stride")
|
||||
.insert("stride_b", "0", "Tensor B stride")
|
||||
.insert("stride_c", "0", "Tensor C stride")
|
||||
.insert("v", "1", "0. No validation, 1. Validation on CPU, 2. Validation on GPU")
|
||||
.insert(
|
||||
"mx_prec", "fp4xfp4", "data type for activation and weight, support: fp6xfp6, fp8xfp8")
|
||||
.insert("warmup", "50", "number of iterations before benchmark the kernel")
|
||||
.insert("repeat", "100", "number of iterations to benchmark the kernel")
|
||||
.insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer")
|
||||
.insert("split_k", "1", "splitK value")
|
||||
.insert("init", "0", "0:random, 1:constant(1)")
|
||||
.insert("persistent", "0", "0: no persistent, 1: persistent kernel")
|
||||
.insert("warp_tile",
|
||||
"0",
|
||||
"0: 16x16, 1: 32x32, 2: 16x16x128 (950 only), 3: 32x32x64 (950 only)");
|
||||
bool result = arg_parser.parse(argc, argv);
|
||||
return std::make_tuple(result, arg_parser);
|
||||
}
|
||||
|
||||
/// @brief Reorders a row-major N x (K/2) weight buffer into the flat layout
///        `N0 K0 KLane NLane KPack`.
///
/// The decomposition is
///   K/2 -> K0 * KLane * KPack   (KPack = 16, KLane = 64 / NLane)
///   N   -> N0 * NLane           (NLane = FlatmmConfig::N_Warp_Tile)
/// and element (n, k) of the source is written to
///   ((((n0 * K0 + k0) * KLane + k1) * NLane + n1) * KPack + k2).
///
/// All index arithmetic is done in a pointer-sized signed type so that large
/// weights (N * K/2 beyond the range of int) do not overflow.
///
/// @param src Random-access source, read as `src[n * (K / 2) + k]`.
/// @param dst Random-access destination with room for N * (K / 2) elements.
/// @param N   Number of rows.
/// @param K   Logical K extent; K / 2 storage elements are shuffled per row
///            (two values per storage element).
/// @note K / 2 is expected to be a multiple of KLane * KPack and N a multiple of
///       NLane; this is not checked, and other sizes map several sources onto the
///       same destination index.
template <class FlatmmConfig, class IterSrc, class IterDst>
void preShuffleWeight(const IterSrc src, IterDst dst, int N, int K)
{
    using wide_index = std::ptrdiff_t;

    const wide_index KPack = 16;
    const wide_index NLane = FlatmmConfig::N_Warp_Tile;
    const wide_index KLane = 64 / NLane;
    const wide_index K_pk  = K / 2;
    const wide_index K0    = K_pk / (KLane * KPack);

    // Element counts of one k1 slice, one k0 slice and one n0 slice of the output.
    const wide_index k1_stride = KPack * NLane;
    const wide_index k0_stride = k1_stride * KLane;
    const wide_index n0_stride = k0_stride * K0;

    for(wide_index n = 0; n < N; ++n)
    {
        // Row-dependent terms are constant across the inner loop.
        const wide_index n0       = n / NLane;
        const wide_index n1       = n % NLane;
        const wide_index row_base = n0 * n0_stride + n1 * KPack;
        const wide_index src_base = n * K_pk;

        for(wide_index k = 0; k < K_pk; ++k)
        {
            const wide_index k0    = k / (KLane * KPack);
            const wide_index tempk = k % (KLane * KPack);
            const wide_index k1    = tempk / KPack;
            const wide_index k2    = tempk % KPack;

            const wide_index outputIndex = row_base + k0 * k0_stride + k1 * k1_stride + k2;

            dst[outputIndex] = src[src_base + k];
        }
    }
}
|
||||
|
||||
template <class FlatmmConfig, bool KLast, typename Src>
|
||||
auto preShuffleScale(Src& src)
|
||||
{
|
||||
using dtype = typename Src::Data::value_type;
|
||||
auto src_lengths = src.get_lengths();
|
||||
const auto MN = KLast ? src_lengths[0] : src_lengths[1];
|
||||
const auto K = KLast ? src_lengths[1] : src_lengths[0];
|
||||
|
||||
size_t MNXdlPack = 2;
|
||||
size_t KXdlPack = 2;
|
||||
size_t XdlMNThread = FlatmmConfig::N_Warp_Tile; // 16
|
||||
size_t XdlKThread = 64 / XdlMNThread;
|
||||
|
||||
const auto MN_Paded = ck_tile::integer_least_multiple(MN, XdlMNThread * MNXdlPack);
|
||||
|
||||
ck_tile::HostTensor<dtype> shuffled(ck_tile::HostTensorDescriptor({MN_Paded * K}, {1}));
|
||||
|
||||
size_t K0 = K / KXdlPack / XdlKThread; // KRepeat
|
||||
|
||||
// The 4 16x128 building blocks will be packed into 1 32x256 for F4
|
||||
// The 8 16x16x128 mfma will be packed into 1 32x32x256 for F4
|
||||
|
||||
// unfold the MN32xK(256/32) scale buffer
|
||||
// 4 16 2 2
|
||||
// To XdlKThread-> XdlMNThread -> KXdlPack -> MNXdlPack
|
||||
// Then, MNRepeat->KRepeat
|
||||
|
||||
for(size_t n = 0; n < MN_Paded; ++n)
|
||||
{
|
||||
for(size_t k = 0; k < K; ++k)
|
||||
{
|
||||
auto n0 = n / (XdlMNThread * MNXdlPack); // i MNRepeat
|
||||
auto tempn = n % (XdlMNThread * MNXdlPack);
|
||||
auto n1 = tempn % XdlMNThread; // i XdlMNThread
|
||||
auto n2 = tempn / XdlMNThread; // i MNXdlPack
|
||||
|
||||
auto k0 = k / (XdlKThread * KXdlPack); // i KRepeat
|
||||
auto tempk = k % (XdlKThread * KXdlPack);
|
||||
auto k1 = tempk % XdlKThread; // i XdlKThread
|
||||
auto k2 = tempk / XdlKThread; // i KXdlPack
|
||||
|
||||
auto outputIndex = n0 * MNXdlPack * KXdlPack * XdlMNThread * XdlKThread * K0 +
|
||||
k0 * MNXdlPack * KXdlPack * XdlMNThread * XdlKThread +
|
||||
k1 * MNXdlPack * KXdlPack * XdlMNThread + n1 * MNXdlPack * KXdlPack +
|
||||
k2 * MNXdlPack + n2;
|
||||
|
||||
if constexpr(KLast)
|
||||
shuffled(outputIndex) = n < MN ? src(n, k) : dtype{};
|
||||
else
|
||||
shuffled(outputIndex) = n < MN ? src(k, n) : dtype{};
|
||||
}
|
||||
}
|
||||
return shuffled;
|
||||
}
|
||||
|
||||
#include "run_mx_flatmm.inc"
|
||||
|
||||
template <typename FlatmmConfig>
|
||||
int run_mx_flatmm_example(int argc, char* argv[])
|
||||
{
|
||||
auto [result, arg_parser] = create_args(argc, argv);
|
||||
if(!result)
|
||||
return -1;
|
||||
|
||||
using Row = ck_tile::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck_tile::tensor_layout::gemm::ColumnMajor;
|
||||
|
||||
std::string mx_prec = arg_parser.get_str("mx_prec");
|
||||
std::string a_layout = arg_parser.get_str("a_layout");
|
||||
std::string b_layout = arg_parser.get_str("b_layout");
|
||||
int persistent_opt = arg_parser.get_int("persistent");
|
||||
|
||||
if(a_layout == "R" && b_layout == "C")
|
||||
{
|
||||
if(mx_prec == "fp4xfp4")
|
||||
{
|
||||
if(persistent_opt == 0)
|
||||
{
|
||||
run_mx_flatmm_with_layouts<ck_tile::pk_fp4_t,
|
||||
ck_tile::pk_fp4_t,
|
||||
ck_tile::fp16_t,
|
||||
FlatmmConfig,
|
||||
false>(argc, argv, Row{}, Col{}, Row{});
|
||||
}
|
||||
else
|
||||
{
|
||||
run_mx_flatmm_with_layouts<ck_tile::pk_fp4_t,
|
||||
ck_tile::pk_fp4_t,
|
||||
ck_tile::fp16_t,
|
||||
FlatmmConfig,
|
||||
true>(argc, argv, Row{}, Col{}, Row{});
|
||||
}
|
||||
}
|
||||
else if(mx_prec == "fp6xfp6")
|
||||
{
|
||||
throw std::runtime_error("Only support fp4xfp4 now!");
|
||||
}
|
||||
else if(mx_prec == "fp8xfp8")
|
||||
{
|
||||
throw std::runtime_error("Only support fp4xfp4 now!");
|
||||
}
|
||||
else
|
||||
{
|
||||
throw std::runtime_error("Unsupported data_type!");
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
throw std::runtime_error("Unsupported data layout configuration for A,B and C tensors!");
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[])
|
||||
{
|
||||
auto [result, arg_parser] = create_args(argc, argv);
|
||||
if(!result)
|
||||
return EXIT_FAILURE;
|
||||
try
|
||||
{
|
||||
int warp_tile = arg_parser.get_int("warp_tile");
|
||||
if(warp_tile == 0)
|
||||
{
|
||||
return !run_mx_flatmm_example<MXfp4_FlatmmConfig16>(argc, argv);
|
||||
}
|
||||
else if(warp_tile == 1)
|
||||
{
|
||||
throw std::runtime_error("Only support MFMA_16x16x128 now!");
|
||||
}
|
||||
else
|
||||
{
|
||||
throw std::runtime_error("Unsupported warp_tile!");
|
||||
}
|
||||
}
|
||||
catch(const std::runtime_error& e)
|
||||
{
|
||||
std::cerr << "Runtime error: " << e.what() << '\n';
|
||||
return EXIT_FAILURE;
|
||||
}
|
||||
}
|
||||
15
example/ck_tile/18_flatmm/mxgemm/mx_flatmm.hpp
Normal file
15
example/ck_tile/18_flatmm/mxgemm/mx_flatmm.hpp
Normal file
@@ -0,0 +1,15 @@
|
||||
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host/kernel_launch.hpp"
|
||||
#include "ck_tile/ops/epilogue.hpp"
|
||||
#include "ck_tile/ops/flatmm.hpp"
|
||||
#include "ck_tile/ops/gemm.hpp"
|
||||
|
||||
#include "mxfp4_flatmm.hpp"
|
||||
40
example/ck_tile/18_flatmm/mxgemm/mxfp4_flatmm.hpp
Normal file
40
example/ck_tile/18_flatmm/mxgemm/mxfp4_flatmm.hpp
Normal file
@@ -0,0 +1,40 @@
|
||||
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
|
||||
// GEMM config with 16x16 warp tile
|
||||
struct MXfp4_FlatmmConfig16
|
||||
{
|
||||
static constexpr ck_tile::index_t M_Tile = 128;
|
||||
static constexpr ck_tile::index_t N_Tile = 512;
|
||||
static constexpr ck_tile::index_t K_Tile = 256;
|
||||
|
||||
static constexpr ck_tile::index_t M_Warp = 1;
|
||||
static constexpr ck_tile::index_t N_Warp = 4;
|
||||
static constexpr ck_tile::index_t K_Warp = 1;
|
||||
|
||||
static constexpr ck_tile::index_t M_Warp_Tile = 16;
|
||||
static constexpr ck_tile::index_t N_Warp_Tile = 16;
|
||||
static constexpr ck_tile::index_t K_Warp_Tile = 128;
|
||||
|
||||
static constexpr bool kPadM = false;
|
||||
static constexpr bool kPadN = false;
|
||||
static constexpr bool kPadK = false;
|
||||
|
||||
static constexpr bool TransposeC = false;
|
||||
static constexpr bool UseStructuredSparsity = false;
|
||||
|
||||
static constexpr int kBlockPerCu = 1;
|
||||
static constexpr int TileParitionerGroupNum = 8;
|
||||
static constexpr int TileParitionerM01 = 4;
|
||||
static constexpr auto Scheduler = ck_tile::GemmPipelineScheduler::Default;
|
||||
static constexpr ck_tile::index_t NumWaveGroups = 1;
|
||||
static constexpr bool DoubleSmemBuffer = false;
|
||||
|
||||
static constexpr int N_Repeat = N_Tile / N_Warp_Tile / N_Warp;
|
||||
static constexpr bool TiledMMAPermuteN = false;
|
||||
};
|
||||
167
example/ck_tile/18_flatmm/mxgemm/run_mx_flatmm.inc
Normal file
167
example/ck_tile/18_flatmm/mxgemm/run_mx_flatmm.inc
Normal file
@@ -0,0 +1,167 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
template <typename PrecActType,
|
||||
typename PrecWeightType,
|
||||
typename CDataType,
|
||||
typename FlatmmConfig,
|
||||
bool UsePersistentKernel = false,
|
||||
typename ALayout,
|
||||
typename BLayout,
|
||||
typename CLayout>
|
||||
int run_mx_flatmm_with_layouts(int argc,
|
||||
char* argv[],
|
||||
const ALayout a_layout = ALayout{},
|
||||
const BLayout b_layout = BLayout{},
|
||||
const CLayout c_layout = CLayout{})
|
||||
{
|
||||
auto [result, arg_parser] = create_args(argc, argv);
|
||||
if(!result)
|
||||
return -1;
|
||||
|
||||
using ADataType = PrecActType;
|
||||
using BDataType = PrecWeightType;
|
||||
using AccDataType = float;
|
||||
|
||||
using ScaleType = ck_tile::e8m0_t;
|
||||
|
||||
constexpr int ScaleGranularityM = 1;
|
||||
constexpr int ScaleGranularityN = 1;
|
||||
constexpr int ScaleGranularityK = 32;
|
||||
|
||||
ck_tile::index_t M = arg_parser.get_int("m");
|
||||
ck_tile::index_t N = arg_parser.get_int("n");
|
||||
ck_tile::index_t K = arg_parser.get_int("k");
|
||||
|
||||
ck_tile::index_t stride_A = arg_parser.get_int("stride_a");
|
||||
ck_tile::index_t stride_B = arg_parser.get_int("stride_b");
|
||||
ck_tile::index_t stride_C = arg_parser.get_int("stride_c");
|
||||
|
||||
ck_tile::index_t kbatch = arg_parser.get_int("split_k");
|
||||
ck_tile::index_t init_method = arg_parser.get_int("init");
|
||||
ck_tile::index_t n_warmup = arg_parser.get_int("warmup");
|
||||
ck_tile::index_t n_repeat = arg_parser.get_int("repeat");
|
||||
|
||||
stride_A = ck_tile::get_default_stride(M, K, stride_A, is_row_major(a_layout));
|
||||
stride_B = ck_tile::get_default_stride(K, N, stride_B, is_row_major(b_layout));
|
||||
stride_C = ck_tile::get_default_stride(M, N, stride_C, is_row_major(c_layout));
|
||||
|
||||
auto scale_stride_A = ck_tile::get_default_stride(
|
||||
M / ScaleGranularityM, K / ScaleGranularityK, 0, is_row_major(a_layout));
|
||||
auto scale_stride_B = ck_tile::get_default_stride(
|
||||
K / ScaleGranularityK, N / ScaleGranularityN, 0, is_row_major(b_layout));
|
||||
|
||||
if(K % ScaleGranularityK != 0)
|
||||
throw std::runtime_error("wrong! K must be multiple of ScaleGranularityK.");
|
||||
if(K % ck_tile::numeric_traits<ADataType>::PackedSize != 0 ||
|
||||
K % ck_tile::numeric_traits<BDataType>::PackedSize != 0)
|
||||
throw std::runtime_error("wrong! K must be multiple of packed size.");
|
||||
|
||||
ck_tile ::HostTensor<ADataType> a_host(
|
||||
ck_tile::host_tensor_descriptor(M, K, stride_A, is_row_major(a_layout)));
|
||||
ck_tile::HostTensor<BDataType> b_origin_host(
|
||||
ck_tile::host_tensor_descriptor(K, N, stride_B, is_row_major(b_layout)));
|
||||
ck_tile::HostTensor<CDataType> c_rslt_host(
|
||||
ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{})));
|
||||
|
||||
ck_tile::HostTensor<ScaleType> scale_a(ck_tile::host_tensor_descriptor(
|
||||
M / ScaleGranularityM, K / ScaleGranularityK, scale_stride_A, is_row_major(a_layout)));
|
||||
ck_tile::HostTensor<ScaleType> scale_b(ck_tile::host_tensor_descriptor(
|
||||
K / ScaleGranularityK, N / ScaleGranularityN, scale_stride_B, is_row_major(b_layout)));
|
||||
|
||||
if(init_method == 0)
|
||||
{
|
||||
ck_tile::FillUniformDistribution<ADataType>{0.0f, 1.0f}(a_host);
|
||||
ck_tile::FillUniformDistribution<BDataType>{-.5f, .5f}(b_origin_host);
|
||||
ck_tile::FillUniformDistribution<ScaleType>{-2.f, 2.f}(scale_a);
|
||||
ck_tile::FillUniformDistribution<ScaleType>{-2.f, 2.f}(scale_b);
|
||||
}
|
||||
else if(init_method == 1)
|
||||
{
|
||||
ck_tile::FillUniformDistribution<ADataType>{1.f, 1.f}(a_host);
|
||||
ck_tile::FillUniformDistribution<BDataType>{1.f, 1.f}(b_origin_host);
|
||||
ck_tile::FillUniformDistribution<ScaleType>{1.f, 1.f}(scale_a);
|
||||
ck_tile::FillUniformDistribution<ScaleType>{1.f, 1.f}(scale_b);
|
||||
}
|
||||
else
|
||||
{
|
||||
throw std::runtime_error("wrong! Unexpected init_method");
|
||||
}
|
||||
|
||||
ck_tile::HostTensor<BDataType> b_shuffled_host(
|
||||
ck_tile::host_tensor_descriptor(K, N, stride_B, is_row_major(b_layout)));
|
||||
preShuffleWeight<FlatmmConfig>(b_origin_host.begin(), b_shuffled_host.begin(), N, K);
|
||||
|
||||
const auto scale_a_shuffled = preShuffleScale<FlatmmConfig, true>(scale_a);
|
||||
const auto scale_b_shuffled = preShuffleScale<FlatmmConfig, false>(scale_b);
|
||||
|
||||
ck_tile::DeviceMem a_dev_buf(a_host.get_element_space_size_in_bytes());
|
||||
ck_tile::DeviceMem b_shuffled_dev_buf(b_shuffled_host.get_element_space_size_in_bytes());
|
||||
ck_tile::DeviceMem c_dev_buf(c_rslt_host.get_element_space_size_in_bytes());
|
||||
|
||||
ck_tile::DeviceMem scale_a_dev_buf(scale_a_shuffled.get_element_space_size_in_bytes());
|
||||
ck_tile::DeviceMem scale_b_dev_buf(scale_b_shuffled.get_element_space_size_in_bytes());
|
||||
|
||||
a_dev_buf.ToDevice(a_host.data());
|
||||
b_shuffled_dev_buf.ToDevice(b_shuffled_host.data());
|
||||
c_rslt_host.SetZero();
|
||||
scale_a_dev_buf.ToDevice(scale_a_shuffled.data());
|
||||
scale_b_dev_buf.ToDevice(scale_b_shuffled.data());
|
||||
|
||||
auto scale_a_dev_ptr = ck_tile::FlatmmScalePointer<ScaleGranularityM, ScaleGranularityK>{
|
||||
static_cast<float*>(scale_a_dev_buf.GetDeviceBuffer()), M / ScaleGranularityM};
|
||||
auto scale_b_dev_ptr = ck_tile::FlatmmScalePointer<ScaleGranularityN, ScaleGranularityK>{
|
||||
static_cast<float*>(scale_b_dev_buf.GetDeviceBuffer()), N / ScaleGranularityN};
|
||||
|
||||
invoke_mx_flatmm<FlatmmConfig,
|
||||
ADataType,
|
||||
BDataType,
|
||||
ck_tile::tuple<>,
|
||||
AccDataType,
|
||||
CDataType,
|
||||
ALayout,
|
||||
BLayout,
|
||||
ck_tile::tuple<>,
|
||||
CLayout,
|
||||
decltype(scale_a_dev_ptr),
|
||||
decltype(scale_b_dev_ptr),
|
||||
UsePersistentKernel>(a_dev_buf,
|
||||
b_shuffled_dev_buf,
|
||||
c_dev_buf,
|
||||
M,
|
||||
N,
|
||||
K,
|
||||
stride_A,
|
||||
stride_B,
|
||||
stride_C,
|
||||
kbatch,
|
||||
scale_a_dev_ptr,
|
||||
scale_b_dev_ptr,
|
||||
n_warmup,
|
||||
n_repeat);
|
||||
|
||||
c_dev_buf.FromDevice(c_rslt_host.data());
|
||||
|
||||
bool pass = true;
|
||||
if(arg_parser.get_int("v") == 1)
|
||||
{
|
||||
ck_tile::HostTensor<CDataType> c_m_n_host_ref(
|
||||
ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{})));
|
||||
c_m_n_host_ref.SetZero();
|
||||
|
||||
ck_tile::reference_mx_gemm<ADataType, BDataType, ScaleType, AccDataType, CDataType>(
|
||||
a_host, b_origin_host, c_m_n_host_ref, scale_a, scale_b);
|
||||
|
||||
const float rtol = std::is_same_v<ADataType, ck_tile::half_t> ? 1e-3 : 1e-2;
|
||||
const float atol = std::is_same_v<ADataType, ck_tile::half_t> ? 1e-3 : 1e-2;
|
||||
|
||||
pass = ck_tile::check_err(
|
||||
c_rslt_host, c_m_n_host_ref, "Error: Incorrect results!", rtol, atol);
|
||||
|
||||
std::cout << "Relative error threshold: " << rtol << " Absolute error threshold: " << atol
|
||||
<< std::endl;
|
||||
std::cout << "The GPU veification result is: " << (pass ? "correct" : "fail") << std::endl;
|
||||
}
|
||||
|
||||
return pass;
|
||||
}
|
||||
Reference in New Issue
Block a user