adapt gemm_mx_kernel.hpp from flatmm, comment the changes needed to the MX pipeline from flatmm

This commit is contained in:
Sami Remes
2025-12-18 04:06:04 -05:00
parent 292df2719f
commit 4985afb03c
6 changed files with 2566 additions and 0 deletions


@@ -0,0 +1,381 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include <array>
#include <stdexcept>
#include <string>
#include "ck_tile/core.hpp"
#include "ck_tile/ops/common.hpp"
#include "ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp"
namespace ck_tile {
template <typename ScaleM = MXScalePointer<-1>,
typename ScaleN = MXScalePointer<-1>,
index_t NumATensor = 1,
index_t NumBTensor = 1,
index_t NumDTensor = 0>
struct MXGemmKernelArgs : UniversalGemmKernelArgs<NumATensor, NumBTensor, NumDTensor>
{
using Base = UniversalGemmKernelArgs<NumATensor, NumBTensor, NumDTensor>;
CK_TILE_HOST MXGemmKernelArgs(const std::array<const void*, NumATensor>& as_ptr_,
const std::array<const void*, NumBTensor>& bs_ptr_,
const std::array<const void*, NumDTensor>& ds_ptr_,
void* e_ptr_,
index_t k_batch_,
index_t M_,
index_t N_,
index_t K_,
const std::array<index_t, NumATensor>& stride_As_,
const std::array<index_t, NumBTensor>& stride_Bs_,
const std::array<index_t, NumDTensor>& stride_Ds_,
index_t stride_E_,
ScaleM scale_m_ptr_ = {},
ScaleN scale_n_ptr_ = {})
: Base(as_ptr_,
bs_ptr_,
ds_ptr_,
e_ptr_,
k_batch_,
M_,
N_,
K_,
stride_As_,
stride_Bs_,
stride_Ds_,
stride_E_)
, scale_m_ptr(scale_m_ptr_)
, scale_n_ptr(scale_n_ptr_)
{
}
// MX block-scale pointers, read by the kernel as kargs.scale_m_ptr /
// kargs.scale_n_ptr; UniversalGemmKernelArgs has no such members.
ScaleM scale_m_ptr;
ScaleN scale_n_ptr;
};
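// Host-side construction sketch (hypothetical; a_dev, b_dev, e_dev and
// scale_a_dev/scale_b_dev are illustrative device buffers, not names from this
// commit), assuming per-row scales over 32-wide K blocks:
//
//   using ScaleM = MXScalePointer<1, 32>;
//   using ScaleN = MXScalePointer<1, 32>;
//   MXGemmKernelArgs<ScaleM, ScaleN> kargs({a_dev}, {b_dev}, {}, e_dev,
//                                          /*k_batch_=*/1, M, N, K,
//                                          {stride_a}, {stride_b}, {}, stride_e,
//                                          ScaleM{scale_a_dev}, ScaleN{scale_b_dev});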
template <typename TilePartitioner_, typename MXGemmPipeline_, typename EpiloguePipeline_>
struct MXGemmKernel : UniversalGemmKernel<TilePartitioner_, MXGemmPipeline_, EpiloguePipeline_>
{
using Underlying = UniversalGemmKernel<TilePartitioner_, MXGemmPipeline_, EpiloguePipeline_>;
using TilePartitioner = remove_cvref_t<TilePartitioner_>;
using MXGemmPipeline = remove_cvref_t<MXGemmPipeline_>;
using BlockGemmShape =
remove_cvref_t<typename MXGemmPipeline::BlockGemmShape>;
using EpiloguePipeline = remove_cvref_t<EpiloguePipeline_>;
using ALayout = remove_cvref_t<typename MXGemmPipeline::ALayout>;
using BLayout = remove_cvref_t<typename MXGemmPipeline::BLayout>;
using ELayout = remove_cvref_t<typename MXGemmPipeline::CLayout>;
using DsLayout = remove_cvref_t<typename EpiloguePipeline::DsLayout>;
using DsDataType = remove_cvref_t<typename EpiloguePipeline::DsDataType>;
static constexpr index_t KernelBlockSize = MXGemmPipeline::BlockSize;
static constexpr bool UsePersistentKernel = MXGemmPipeline::UsePersistentKernel;
using ADataType = remove_cvref_t<typename MXGemmPipeline::ADataType>;
using BDataType = remove_cvref_t<typename MXGemmPipeline::BDataType>;
// Below type is actually accumulation data type - the output of block GEMM.
using EDataType = remove_cvref_t<typename EpiloguePipeline::ODataType>;
static constexpr auto I0 = number<0>();
static constexpr auto I1 = number<1>();
static constexpr auto I2 = number<2>();
static constexpr auto I3 = number<3>();
static constexpr auto I4 = number<4>();
static constexpr auto I5 = number<5>();
static constexpr index_t NumATensor = Underlying::AsDataType::size();
static constexpr index_t NumBTensor = Underlying::BsDataType::size();
static constexpr index_t NumDTensor = Underlying::DsDataType::size();
static constexpr auto MThreadPerXdl = BlockGemmShape::WarpTile::at(number<0>{});
static constexpr auto NThreadPerXdl = BlockGemmShape::WarpTile::at(number<1>{});
static constexpr auto KThreadPerXdl = 64 / MThreadPerXdl;
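// Wave64: the 64 lanes of a wavefront are split between the M and K dimensions
// of one XDL tile, so e.g. MThreadPerXdl = 32 leaves KThreadPerXdl = 2.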
static constexpr auto APackedSize = numeric_traits<ADataType>::PackedSize;
static constexpr auto BPackedSize = numeric_traits<BDataType>::PackedSize;
static constexpr auto MXdlPack = MXGemmPipeline::MXdlPack;
static constexpr auto NXdlPack = MXGemmPipeline::NXdlPack;
static constexpr auto KXdlPack = MXGemmPipeline::KXdlPack;
static_assert(DsLayout::size() == DsDataType::size(),
"The size of DsLayout and DsDataType should be the same");
[[nodiscard]] CK_TILE_HOST static const std::string GetName()
{
// clang-format off
return concat('_', "mx_gemm", gemm_prec_str<ADataType, BDataType>, MXGemmPipeline::GetName());
// clang-format on
}
template <typename ScaleM, typename ScaleN>
using KernelArgs = MXGemmKernelArgs<ScaleM, ScaleN, NumATensor, NumBTensor, NumDTensor>;
template <class ScaleM, class ScaleN>
CK_TILE_HOST static constexpr auto
GridSize(const KernelArgs<ScaleM, ScaleN>& kargs)
{
hipDeviceProp_t prop;
int deviceId = 0; // default device
int dyn_smem_size = 0;
int maxActiveBlocksPerCU = 0;
if(hipGetDeviceProperties(&prop, deviceId) != hipSuccess)
throw std::runtime_error(std::string("hipGetDeviceProperties failed: ") +
hipGetErrorName(hipGetLastError()));
if(hipOccupancyMaxActiveBlocksPerMultiprocessor(
&maxActiveBlocksPerCU,
reinterpret_cast<void*>(
kentry<1, MXGemmKernel, remove_cvref_t<decltype(kargs)>>),
KernelBlockSize,
dyn_smem_size) != hipSuccess)
throw std::runtime_error(
std::string("hipOccupancyMaxActiveBlocksPerMultiprocessor failed: ") +
hipGetErrorName(hipGetLastError()));
const int persistent_block_size = prop.multiProcessorCount * maxActiveBlocksPerCU;
const int total_work_tile_cnt = TilePartitioner::GridSize(kargs.M, kargs.N);
return dim3(min(persistent_block_size, total_work_tile_cnt), 1, 1);
}
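// Persistent-kernel grid sizing: launch no more workgroups than the GPU can
// keep resident. With illustrative numbers, 304 CUs at 2 resident blocks per
// CU give 608 workgroups, while a 4096x4096 output with 256x256 tiles has only
// 16 * 16 = 256 work tiles, so the grid is min(608, 256) = 256.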
using SplitKBatchOffset = typename Underlying::SplitKBatchOffset;
template <memory_operation_enum DstInMemOp = memory_operation_enum::set, typename ScaleM, typename ScaleN>
CK_TILE_DEVICE static auto
MakeGemmTensorViews(const std::array<const ADataType*, NumATensor>& as_ptr,
const std::array<const BDataType*, NumBTensor>& bs_ptr,
const std::array<const void*, NumDTensor>& ds_ptr,
EDataType* e_ptr,
const KernelArgs<ScaleM, ScaleN>& kargs,
const index_t k_size)
{
// Get tensor views from the UniversalGemmKernel
const auto& gemm_tensor_views_tuple =
Underlying::template MakeGemmTensorViews<DstInMemOp>(
as_ptr, bs_ptr, ds_ptr, e_ptr, kargs, k_size);
auto scale_a = kargs.scale_m_ptr;
auto scale_b = kargs.scale_n_ptr;
static constexpr int BlockScaleSize = 32; // could be taken from ScaleN::GranularityK
const auto scale_packs_m = integer_divide_ceil(kargs.M, (MXdlPack * MThreadPerXdl));
const auto scale_packs_n = integer_divide_ceil(kargs.N, (NXdlPack * NThreadPerXdl));
const auto scale_packs_k = kargs.K / BlockScaleSize / (KXdlPack * KThreadPerXdl);
// A scale tensor view
const auto& scale_a_tensor_view = [&]() {
// Pack 2x2 e8m0 over M/K dimension into 1 int32_t to trigger dword width load
const auto scale_a_naive_desc = make_naive_tensor_descriptor_packed(
make_tuple(scale_packs_m, scale_packs_k, KThreadPerXdl, MThreadPerXdl));
const auto scale_a_desc = transform_tensor_descriptor(
scale_a_naive_desc,
make_tuple(make_merge_transform(make_tuple(scale_packs_m, MThreadPerXdl)),
make_merge_transform(make_tuple(scale_packs_k, KThreadPerXdl))),
make_tuple(sequence<0, 3>{}, sequence<1, 2>{}),
make_tuple(sequence<0>{}, sequence<1>{}));
return make_tensor_view<address_space_enum::global>(
reinterpret_cast<const int32_t*>(scale_a.ptr), scale_a_desc);
}();
// B scale tensor view
const auto& scale_b_tensor_view = [&]() {
const auto scale_b_naive_desc = make_naive_tensor_descriptor_packed(
make_tuple(scale_packs_n, scale_packs_k, KThreadPerXdl, NThreadPerXdl));
const auto scale_b_desc = transform_tensor_descriptor(
scale_b_naive_desc,
make_tuple(make_merge_transform(make_tuple(scale_packs_n, NThreadPerXdl)),
make_merge_transform(make_tuple(scale_packs_k, KThreadPerXdl))),
make_tuple(sequence<0, 3>{}, sequence<1, 2>{}),
make_tuple(sequence<0>{}, sequence<1>{}));
return make_tensor_view<address_space_enum::global>(
reinterpret_cast<const int32_t*>(scale_b.ptr), scale_b_desc);
}();
return concat_tuple(gemm_tensor_views_tuple, make_tuple(scale_a_tensor_view, scale_b_tensor_view));
}
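// Scale layout sketch (illustrative numbers): with a 32x32 warp tile
// (MThreadPerXdl = 32, KThreadPerXdl = 64 / 32 = 2), MXdlPack = KXdlPack = 2
// and BlockScaleSize = 32, an M = 4096, K = 8192 problem gives
// scale_packs_m = ceil(4096 / (2 * 32)) = 64 and
// scale_packs_k = 8192 / 32 / (2 * 2) = 64; each int32_t then carries a 2x2
// pack of e8m0 scales, so one dword load fetches a whole pack.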
template <typename TensorView>
CK_TILE_DEVICE static auto MakeGemmPadViews(const TensorView& views)
{
const auto& padded_views = Underlying::template MakeGemmPadViews(views);
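// The scale views (I4, I5) are passed through unpadded; their descriptors are
// already sized in whole packs via integer_divide_ceil above.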
return make_tuple(
padded_views.at(I0), padded_views.at(I1), padded_views.at(I2), padded_views.at(I3), views.at(I4), views.at(I5));
}
template <typename PadView>
CK_TILE_DEVICE static auto
MakeGemmTileWindows(const PadView& views, const index_t i_m, const index_t i_n)
{
const auto& tile_windows = Underlying::template MakeGemmTileWindows(views, i_m, i_n);
static constexpr int BlockScaleSize = 32;
auto scale_a_block_window = make_tile_window(
views.at(I4),
make_tuple(number<TilePartitioner::MPerBlock / MXdlPack>{},
number<TilePartitioner::KPerBlock / (BlockScaleSize * KXdlPack)>{}),
{i_m / MXdlPack, 0});
auto scale_b_block_window = make_tile_window(
views.at(I5),
make_tuple(number<TilePartitioner::NPerBlock / NXdlPack>{},
number<TilePartitioner::KPerBlock / (BlockScaleSize * KXdlPack)>{}),
{i_n / NXdlPack, 0});
return make_tuple(tile_windows.at(I0),
tile_windows.at(I1),
tile_windows.at(I2),
tile_windows.at(I3),
scale_a_block_window,
scale_b_block_window);
}
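// Window-extent sketch (illustrative): MPerBlock = 256, KPerBlock = 256,
// MXdlPack = KXdlPack = 2 and BlockScaleSize = 32 give a 128 x 4 window of
// packed scale dwords per block tile, with the M origin stepped by
// i_m / MXdlPack.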
template <class ScaleM, class ScaleN, bool UseDefaultScheduler = true>
CK_TILE_DEVICE static void
RunMxGemm(const std::array<const ADataType*, NumATensor>& as_ptr,
const std::array<const BDataType*, NumBTensor>& bs_ptr,
const std::array<const void*, NumDTensor>& ds_ptr,
EDataType* e_ptr,
void* smem_ptr_ping,
void* smem_ptr_pong,
const KernelArgs<ScaleM, ScaleN>& kargs,
const SplitKBatchOffset& splitk_batch_offset,
const index_t block_idx_m,
const index_t block_idx_n)
{
// Create Gemm tensor views, pad views and tile windows
const auto& gemm_tensor_views_tuple =
MakeGemmTensorViews<EpiloguePipeline::MemoryOperation>(
as_ptr, bs_ptr, ds_ptr, e_ptr, kargs, splitk_batch_offset.splitted_k);
const auto& gemm_pad_views = MakeGemmPadViews(gemm_tensor_views_tuple);
auto gemm_tile_windows = MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n);
const index_t num_loop = TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k);
// Run GEMM cooperatively by whole workgroup.
const auto& a_block_window = gemm_tile_windows.at(I0);
const auto& b_block_window = gemm_tile_windows.at(I1);
const auto& d_block_window = gemm_tile_windows.at(I2);
const auto& scale_a_block_window = gemm_tile_windows.at(I4);
const auto& scale_b_block_window = gemm_tile_windows.at(I5);
static_assert(ScaleM::GranularityK == ScaleN::GranularityK // have the same granK
|| ScaleM::GranularityMN == -1 // or ScaleA is disable
|| ScaleN::GranularityMN == -1, // or ScaleB is disable
"ScaleM and ScaleN should have the same GranularityK");
constexpr bool DoEpiScale =
(ScaleM::GranularityMN != -1 && ScaleM::GranularityK == 0) || // per token
(ScaleN::GranularityMN != -1 && ScaleN::GranularityK == 0); // per channel
const auto& c_block_tile = MXGemmPipeline{}(a_block_window,
b_block_window,
scale_a_block_window,
scale_b_block_window,
num_loop,
smem_ptr_ping,
smem_ptr_pong);
// Run Epilogue Pipeline
if constexpr(DoEpiScale)
{
auto& c_block_window = gemm_tile_windows.at(I3);
EpiloguePipeline{}(c_block_window,
c_block_tile,
d_block_window,
smem_ptr_ping,
kargs.scale_m_ptr + block_idx_m,
kargs.scale_n_ptr + block_idx_n);
}
else if(UseDefaultScheduler || (get_warp_id() == 0))
{
// Run Epilogue Pipeline
auto& c_block_window = gemm_tile_windows.at(I3);
EpiloguePipeline{}(c_block_window, c_block_tile, d_block_window, smem_ptr_ping);
}
}
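// Note on the scale dispatch above: GranularityK == 0 with GranularityMN != -1
// marks per-token (A) / per-channel (B) scales, which can be applied once per
// output tile in the epilogue; block-scaled MX operands are instead consumed
// inside the mainloop through the scale windows passed to the pipeline.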
CK_TILE_HOST_DEVICE static constexpr index_t GetSmemPingSize()
{
return max(MXGemmPipeline::GetSmemSize(), EpiloguePipeline::GetSmemSize());
}
CK_TILE_HOST_DEVICE static constexpr index_t GetSmemPongSize()
{
return MXGemmPipeline::GetSmemSize();
}
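// The epilogue runs out of the ping buffer (RunMxGemm hands it smem_ptr_ping),
// so ping must cover max(pipeline, epilogue) while pong only ever backs the
// pipeline.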
template <class ScaleM, class ScaleN>
CK_TILE_DEVICE void operator()(KernelArgs<ScaleM, ScaleN> kargs,
int partition_idx = get_block_id()) const
{
const int total_work_tile_cnt = amd_wave_read_first_lane(TilePartitioner::GridSize(kargs.M, kargs.N));
do
{
const auto [iM, iN] =
TilePartitioner{kargs.M, kargs.N}.GetOutputTileIndex(partition_idx);
const index_t i_m = amd_wave_read_first_lane(iM * TilePartitioner::MPerBlock);
const index_t i_n = amd_wave_read_first_lane(iN * TilePartitioner::NPerBlock);
const SplitKBatchOffset splitk_batch_offset(kargs);
// Apply the split-K offset to every A/B operand pointer.
std::array<const ADataType*, NumATensor> as_ptr;
static_for<0, NumATensor, 1>{}([&](auto i) {
as_ptr[i] = static_cast<const ADataType*>(kargs.as_ptr[i]) +
splitk_batch_offset.as_k_split_offset[i] / APackedSize;
});
std::array<const BDataType*, NumBTensor> bs_ptr;
static_for<0, NumBTensor, 1>{}([&](auto i) {
bs_ptr[i] = static_cast<const BDataType*>(kargs.bs_ptr[i]) +
splitk_batch_offset.bs_k_split_offset[i] / BPackedSize;
});
// Calculate output offset from tile partitioner and apply to output pointer
EDataType* e_ptr = static_cast<EDataType*>(kargs.e_ptr);
if constexpr(has_tile_partitioner_output_offset)
{
const index_t output_offset = TilePartitioner::GetOutputOffset(kargs, blockIdx.z);
e_ptr += output_offset;
}
// allocate LDS
__shared__ char smem_ptr_ping[GetSmemPingSize()];
__shared__ char smem_ptr_pong[GetSmemPongSize()];
if constexpr(!(EpiloguePipeline::MemoryOperation == memory_operation_enum::atomic_add &&
EpiloguePipeline::GetVectorSizeC() % 2 != 0 &&
is_any_of<EDataType, fp16_t, bf16_t>::value))
{
constexpr auto scheduler_type = (MXGemmPipeline::NumWaveGroups == 1);
RunMxGemm<ScaleM, ScaleN, scheduler_type>(as_ptr,
bs_ptr,
kargs.ds_ptr,
e_ptr,
smem_ptr_ping,
smem_ptr_pong,
kargs,
splitk_batch_offset,
i_m,
i_n);
}
else
{
static_assert(false,
"Unimplemented: atomic_add with odd vector size for fp16/bf16");
}
partition_idx += gridDim.x;
} while(UsePersistentKernel && partition_idx < total_work_tile_cnt);
}
};
} // namespace ck_tile


@@ -0,0 +1,110 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/core.hpp"
namespace ck_tile {
template <int SharedGranularityMN, int SharedGranularityK = 0>
struct MXScalePointer
{
static constexpr int GranularityMN = SharedGranularityMN;
static constexpr int GranularityK = SharedGranularityK;
const float* ptr;
CK_TILE_HOST_DEVICE MXScalePointer() = default;
CK_TILE_HOST_DEVICE MXScalePointer(const float* ptr_) : ptr(ptr_) {}
CK_TILE_HOST_DEVICE MXScalePointer(const float* ptr_, [[maybe_unused]] index_t length_)
: ptr(ptr_)
{
}
CK_TILE_HOST_DEVICE MXScalePointer operator+(index_t offset) const
{
MXScalePointer ret;
if constexpr(GranularityMN == 0)
{
ret.ptr = ptr + offset / GranularityK;
}
else
{
ret.ptr = ptr + offset / GranularityMN / GranularityK;
}
return ret;
}
CK_TILE_HOST_DEVICE float operator[](index_t i) const = delete;
};
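// Offset arithmetic sketch (illustrative): for MXScalePointer<1, 32>, one scale
// per row shared across 32 K elements, `p + 96` advances the raw pointer by
// 96 / 1 / 32 = 3 scales; operator[] is deleted here since the kernel reads
// these scales through packed dword tensor views instead.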
template <int SharedGranularityMN>
struct MXScalePointer<SharedGranularityMN, 0>
{
static constexpr int GranularityMN = SharedGranularityMN;
static constexpr int GranularityK = 0;
static_assert(GranularityMN != 0);
const float* ptr;
index_t length;
CK_TILE_HOST_DEVICE MXScalePointer() = default;
CK_TILE_HOST_DEVICE MXScalePointer(const float* ptr_) : ptr(ptr_), length(1) {}
CK_TILE_HOST_DEVICE MXScalePointer(const float* ptr_, index_t length_)
: ptr(ptr_), length(length_)
{
}
CK_TILE_HOST_DEVICE MXScalePointer operator+(index_t offset) const
{
MXScalePointer ret;
if constexpr(GranularityMN == 1)
{
ret.ptr = ptr + offset;
ret.length = length - offset;
}
else
{
ret.ptr = ptr + offset / GranularityMN;
ret.length = length - offset / GranularityMN;
}
return ret;
}
CK_TILE_HOST_DEVICE float operator[](index_t i) const
{
// with additional oob check
if constexpr(GranularityMN == 1)
return i < length ? ptr[i] : 0;
else
return i / GranularityMN < length ? ptr[i / GranularityMN] : 0;
}
};
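// Usage sketch (illustrative): MXScalePointer<4, 0> shares one scale across 4
// rows, so p[10] reads ptr[10 / 4] = ptr[2]; once i / 4 runs past `length` the
// bounds check above returns 0 and keeps the read in range.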
// SharedGranularityMN == -1 means scaling is disabled
template <>
struct MXScalePointer<-1, 0>
{
static constexpr int GranularityMN = -1;
static constexpr int GranularityK = 0;
const float* ptr = nullptr;
CK_TILE_HOST_DEVICE constexpr MXScalePointer() = default;
CK_TILE_HOST_DEVICE constexpr MXScalePointer(const float*) {}
CK_TILE_HOST_DEVICE constexpr MXScalePointer(const float*, index_t) {}
CK_TILE_HOST_DEVICE constexpr MXScalePointer operator+(index_t) const
{
return MXScalePointer{};
}
CK_TILE_HOST_DEVICE constexpr float operator[](index_t) const
{
return 1; // always return 1; it doesn't change the result
}
};
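// With the MXGemmKernelArgs defaults (MXScalePointer<-1>), both scale operands
// collapse to this stub: operator+ is a no-op and operator[] yields the
// multiplicative identity, so the unscaled path costs nothing extra.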
} // namespace ck_tile