mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-14 02:02:46 +00:00
[CK_TILE] MX GEMM non-preshuffled RCR layout (#4594)
## Motivation

Implements a GEMM with MX scaling for fp4 and fp8 in non-preshuffled layouts using the async pipeline.

## Technical Details

<!-- Explain the changes along with any relevant GitHub links. -->

## Test Plan

<!-- Explain any relevant testing done to verify this PR. -->

## Test Result

<!-- Briefly summarize test outcomes. -->

## Submission Checklist

- [ ] Look over the contributing guidelines at https://github.com/ROCm/ROCm/blob/develop/CONTRIBUTING.md#pull-requests.

---------

Co-authored-by: ThomasNing <thomas.ning@amd.com>
This commit is contained in:
@@ -7,6 +7,7 @@
|
||||
#include <variant>
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/core/numeric/pk_fp4.hpp"
|
||||
#include "ck_tile/host/kernel_launch.hpp"
|
||||
#include "ck_tile/ops/epilogue.hpp"
|
||||
#include "ck_tile/ops/gemm.hpp"
|
||||
@@ -241,6 +242,27 @@ struct GemmConfigComputeV6 : public GemmConfigBase
|
||||
static constexpr ck_tile::index_t NumWaveGroups = 1;
|
||||
};
|
||||
|
||||
template <typename PrecType>
|
||||
struct GemmConfigComputeAsync : public GemmConfigBase
|
||||
{
|
||||
static constexpr ck_tile::index_t M_Tile = 64;
|
||||
static constexpr ck_tile::index_t N_Tile = 64;
|
||||
static constexpr ck_tile::index_t K_Tile = 256;
|
||||
|
||||
static constexpr ck_tile::index_t M_Warp = 1;
|
||||
static constexpr ck_tile::index_t N_Warp = 4;
|
||||
static constexpr ck_tile::index_t K_Warp = 1;
|
||||
|
||||
static constexpr ck_tile::index_t M_Warp_Tile = 16;
|
||||
static constexpr ck_tile::index_t N_Warp_Tile = 16;
|
||||
static constexpr ck_tile::index_t K_Warp_Tile = 128;
|
||||
|
||||
static constexpr bool DoubleSmemBuffer = true;
|
||||
static constexpr ck_tile::GemmPipeline Pipeline = ck_tile::GemmPipeline::COMPUTE_ASYNC;
|
||||
static constexpr ck_tile::index_t NumWaveGroups = 1;
|
||||
static constexpr bool UseStructuredSparsity = false;
|
||||
};
|
||||
|
||||
template <typename PrecType>
|
||||
struct GemmConfigPreshuffleDecode : public GemmConfigBase
|
||||
{
|
||||
@@ -375,6 +397,15 @@ struct GemmTypeConfig<ck_tile::int8_t, ck_tile::int8_t, int32_t>
|
||||
using CDataType = int32_t;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct GemmTypeConfig<ck_tile::pk_fp4_t, ck_tile::pk_fp4_t, ck_tile::half_t>
|
||||
{
|
||||
using ADataType = ck_tile::pk_fp4_t;
|
||||
using BDataType = ck_tile::pk_fp4_t;
|
||||
using AccDataType = float;
|
||||
using CDataType = ck_tile::half_t;
|
||||
};
|
||||
|
||||
template <ck_tile::GemmPipeline PipelineId>
|
||||
struct PipelineTypeTraits;
|
||||
|
||||
@@ -423,6 +454,15 @@ struct PipelineTypeTraits<ck_tile::GemmPipeline::COMPUTE_V6>
|
||||
using UniversalGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrCompV6<PipelineProblem>;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct PipelineTypeTraits<ck_tile::GemmPipeline::COMPUTE_ASYNC>
|
||||
{
|
||||
template <typename PipelineProblem>
|
||||
using GemmPipeline = ck_tile::GemmPipelineAgBgCrCompAsync<PipelineProblem>;
|
||||
template <typename PipelineProblem>
|
||||
using UniversalGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrCompAsync<PipelineProblem>;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct PipelineTypeTraits<ck_tile::GemmPipeline::PRESHUFFLE_V2>
|
||||
{
|
||||
|
||||
@@ -30,7 +30,6 @@ auto calculate_rtol_atol(const ck_tile::index_t K,
|
||||
ck_tile::get_relative_threshold<CDataType, CDataType, CDataType>(kbatch);
|
||||
const auto atol_split_k = ck_tile::get_absolute_threshold<CDataType, CDataType, CDataType>(
|
||||
max_accumulated_value, kbatch);
|
||||
// Use higher threshold
|
||||
return ck_tile::make_tuple(std::max(rtol, rtol_split_k), std::max(atol, atol_split_k));
|
||||
}
|
||||
|
||||
@@ -369,7 +368,9 @@ int run_gemm_example_with_layouts(ck_tile::ArgParser& arg_parser,
|
||||
|
||||
std::size_t flop = std::size_t(2) * M * N * K;
|
||||
std::size_t num_byte =
|
||||
sizeof(ADataType) * M * K + sizeof(BDataType) * N * K + sizeof(CDataType) * M * N;
|
||||
sizeof(ADataType) * M * K / ck_tile::numeric_traits<ADataType>::PackedSize +
|
||||
sizeof(BDataType) * N * K / ck_tile::numeric_traits<BDataType>::PackedSize +
|
||||
sizeof(CDataType) * M * N;
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
|
||||
float gb_per_sec = num_byte / 1.E6 / ave_time;
|
||||
|
||||
|
||||
@@ -262,6 +262,23 @@ int run_gemm_example(ck_tile::ArgParser& arg_parser)
|
||||
throw std::runtime_error("Unsupported pipeline for this operation !!!");
|
||||
}
|
||||
}
|
||||
if(data_type == "fp4")
|
||||
{
|
||||
if constexpr(GemmConfig<ck_tile::pk_fp4_t>::Pipeline ==
|
||||
ck_tile::GemmPipeline::COMPUTE_ASYNC &&
|
||||
GemmConfig<ck_tile::pk_fp4_t>::K_Warp_Tile == 128)
|
||||
{
|
||||
return run_gemm_example_prec_type_universal<GemmConfig<ck_tile::pk_fp4_t>,
|
||||
ck_tile::pk_fp4_t,
|
||||
ck_tile::pk_fp4_t,
|
||||
ck_tile::half_t>(
|
||||
a_layout, b_layout, arg_parser);
|
||||
}
|
||||
else
|
||||
{
|
||||
throw std::runtime_error("Unsupported pipeline for this operation !!!");
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
throw std::runtime_error("Unsupported data type for this operation !!!");
|
||||
|
||||
21
example/ck_tile/42_mx_gemm/CMakeLists.txt
Normal file
21
example/ck_tile/42_mx_gemm/CMakeLists.txt
Normal file
@@ -0,0 +1,21 @@
|
||||
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
# Architectures this example is built for.
set(SUPPORTED_GPUS gfx950)

# has_supported_gpu is TRUE when at least one entry of GPU_TARGETS is also in SUPPORTED_GPUS.
set(has_supported_gpu FALSE)
foreach(supported_arch IN LISTS SUPPORTED_GPUS)
    if(supported_arch IN_LIST GPU_TARGETS)
        set(has_supported_gpu TRUE)
        break()
    endif()
endforeach()

# Only declare the example target when a supported architecture was requested.
if(has_supported_gpu)
    add_executable(tile_example_mx_gemm mx_gemm.cpp)

    set(EXAMPLE_MX_GEMM_COMPILE_OPTIONS -Wno-undefined-func-template)
    # Forward the OCP FP8 selection to the example as a preprocessor definition.
    if(CK_USE_OCP_FP8)
        list(APPEND EXAMPLE_MX_GEMM_COMPILE_OPTIONS -DCK_TILE_USE_OCP_FP8)
    endif()

    target_compile_options(tile_example_mx_gemm PRIVATE ${EXAMPLE_MX_GEMM_COMPILE_OPTIONS})
endif()
|
||||
128
example/ck_tile/42_mx_gemm/mx_gemm.cpp
Normal file
128
example/ck_tile/42_mx_gemm/mx_gemm.cpp
Normal file
@@ -0,0 +1,128 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#include <hip/hip_runtime.h>
|
||||
|
||||
#include <cstring>
|
||||
#include <iostream>
|
||||
#include <ostream>
|
||||
#include <string>
|
||||
#include <tuple>
|
||||
#include <type_traits>
|
||||
|
||||
#include "ck_tile/host.hpp"
|
||||
#include "mx_gemm.hpp"
|
||||
#include "mx_gemm_instance.hpp"
|
||||
|
||||
template <typename Layout>
|
||||
static constexpr inline auto is_row_major(Layout layout_)
|
||||
{
|
||||
return ck_tile::bool_constant<std::is_same_v<ck_tile::remove_cvref_t<decltype(layout_)>,
|
||||
ck_tile::tensor_layout::gemm::RowMajor>>{};
|
||||
}
|
||||
|
||||
template <typename GemmConfig,
|
||||
typename ADataType,
|
||||
typename BDataType,
|
||||
typename AccDataType,
|
||||
typename CDataType,
|
||||
typename ALayout,
|
||||
typename BLayout,
|
||||
typename CLayout,
|
||||
typename ScaleM,
|
||||
typename ScaleN,
|
||||
bool UsePersistentKernel = false>
|
||||
float invoke_mx_gemm(ck_tile::DeviceMem& a_dev_buf,
|
||||
ck_tile::DeviceMem& b_dev_buf,
|
||||
ck_tile::DeviceMem& c_dev_buf,
|
||||
ck_tile::index_t M,
|
||||
ck_tile::index_t N,
|
||||
ck_tile::index_t K,
|
||||
ck_tile::index_t stride_A,
|
||||
ck_tile::index_t stride_B,
|
||||
ck_tile::index_t stride_C,
|
||||
ck_tile::index_t kbatch,
|
||||
ScaleM scale_m,
|
||||
ScaleN scale_n,
|
||||
int n_warmup,
|
||||
int n_repeat)
|
||||
{
|
||||
MXGemmHostArgs<ScaleM, ScaleN> args(a_dev_buf.GetDeviceBuffer(),
|
||||
b_dev_buf.GetDeviceBuffer(),
|
||||
c_dev_buf.GetDeviceBuffer(),
|
||||
kbatch,
|
||||
M,
|
||||
N,
|
||||
K,
|
||||
stride_A,
|
||||
stride_B,
|
||||
stride_C,
|
||||
scale_m,
|
||||
scale_n);
|
||||
|
||||
// Simplified invocation - comp_async handles hot loop and tail internally
|
||||
auto invoke_splitk_path = [&](auto split_k_) {
|
||||
return mx_gemm_calc<GemmConfig,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CDataType,
|
||||
ALayout,
|
||||
BLayout,
|
||||
CLayout,
|
||||
ScaleM,
|
||||
ScaleN,
|
||||
UsePersistentKernel,
|
||||
split_k_.value>(
|
||||
args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat, true, true, 50});
|
||||
};
|
||||
|
||||
float ave_time = (args.k_batch == 1) ? invoke_splitk_path(std::false_type{})
|
||||
: invoke_splitk_path(std::true_type{});
|
||||
|
||||
constexpr int APackedSize = ck_tile::numeric_traits<ADataType>::PackedSize;
|
||||
constexpr int BPackedSize = ck_tile::numeric_traits<BDataType>::PackedSize;
|
||||
|
||||
std::size_t flop = std::size_t(2) * M * N * K + std::size_t(2) * M * N * K / 32;
|
||||
std::size_t num_byte = sizeof(ADataType) * M * K / APackedSize +
|
||||
sizeof(BDataType) * N * K / BPackedSize + sizeof(CDataType) * M * N +
|
||||
sizeof(ck_tile::e8m0_t) * M * K / 32 +
|
||||
sizeof(ck_tile::e8m0_t) * N * K / 32;
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
|
||||
float gb_per_sec = num_byte / 1.E6 / ave_time;
|
||||
|
||||
std::cout << "Run " << ck_tile::gemm_prec_str<ADataType, BDataType>() << " MX GEMM kernel " //
|
||||
<< " M = " << M << " N = " << N << " K = " << K << " StrideA = " << stride_A
|
||||
<< " StrideB = " << stride_B << " StrideC = " << stride_C << " : " << ave_time
|
||||
<< " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " << std::endl;
|
||||
|
||||
return ave_time;
|
||||
}
|
||||
|
||||
auto create_args(int argc, char* argv[])
|
||||
{
|
||||
ck_tile::ArgParser arg_parser;
|
||||
arg_parser.insert("m", "4096", "m dimension")
|
||||
.insert("n", "4096", "n dimension")
|
||||
.insert("k", "4096", "k dimension")
|
||||
.insert("a_layout", "R", "A tensor data layout - Row by default")
|
||||
.insert("b_layout", "C", "B tensor data layout - Row by default")
|
||||
.insert("c_layout", "R", "C tensor data layout - Row by default")
|
||||
.insert("stride_a", "0", "Tensor A stride")
|
||||
.insert("stride_b", "0", "Tensor B stride")
|
||||
.insert("stride_c", "0", "Tensor C stride")
|
||||
.insert("v", "1", "0. No validation, 1. Validation on CPU, 2. Validation on GPU")
|
||||
.insert(
|
||||
"mx_prec", "fp4xfp4", "data type for activation and weight, support: fp4xfp4, fp8xfp8")
|
||||
.insert("warmup", "50", "number of iterations before benchmark the kernel")
|
||||
.insert("repeat", "100", "number of iterations to benchmark the kernel")
|
||||
.insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer")
|
||||
.insert("split_k", "1", "splitK value")
|
||||
.insert("init", "0", "0:random, 1:constant(1)");
|
||||
bool result = arg_parser.parse(argc, argv);
|
||||
return std::make_tuple(result, arg_parser);
|
||||
}
|
||||
|
||||
#include "run_mx_gemm.inc"
|
||||
|
||||
// Program entry point: forwards to run_mx_gemm_example and returns its status.
// run_mx_gemm_example reports unsupported precision/layout by throwing
// std::runtime_error; catch it here so the user gets the message and a non-zero
// exit status instead of an uncaught-exception termination.
int main(int argc, char* argv[])
{
    try
    {
        return run_mx_gemm_example(argc, argv);
    }
    catch(const std::runtime_error& e)
    {
        std::cerr << "Runtime error: " << e.what() << '\n';
        return -1;
    }
}
|
||||
99
example/ck_tile/42_mx_gemm/mx_gemm.hpp
Normal file
99
example/ck_tile/42_mx_gemm/mx_gemm.hpp
Normal file
@@ -0,0 +1,99 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host/kernel_launch.hpp"
|
||||
#include "ck_tile/ops/epilogue.hpp"
|
||||
#include "ck_tile/ops/gemm.hpp"
|
||||
#include "ck_tile/ops/gemm_mx/kernel/scale_pointer.hpp"
|
||||
|
||||
template <typename ScaleM, typename ScaleN>
|
||||
struct MXGemmHostArgs : ck_tile::UniversalGemmHostArgs<1, 1, 0>
|
||||
{
|
||||
using Base = ck_tile::UniversalGemmHostArgs<1, 1, 0>;
|
||||
|
||||
MXGemmHostArgs(const void* a_ptr,
|
||||
const void* b_ptr,
|
||||
void* c_ptr_,
|
||||
ck_tile::index_t k_batch_,
|
||||
ck_tile::index_t M_,
|
||||
ck_tile::index_t N_,
|
||||
ck_tile::index_t K_,
|
||||
ck_tile::index_t stride_A_,
|
||||
ck_tile::index_t stride_B_,
|
||||
ck_tile::index_t stride_C_,
|
||||
ScaleM scale_m_,
|
||||
ScaleN scale_n_)
|
||||
: Base({a_ptr},
|
||||
{b_ptr},
|
||||
{},
|
||||
c_ptr_,
|
||||
k_batch_,
|
||||
M_,
|
||||
N_,
|
||||
K_,
|
||||
{stride_A_},
|
||||
{stride_B_},
|
||||
{},
|
||||
stride_C_),
|
||||
scale_m(scale_m_),
|
||||
scale_n(scale_n_)
|
||||
{
|
||||
}
|
||||
|
||||
ScaleM scale_m;
|
||||
ScaleN scale_n;
|
||||
};
|
||||
|
||||
// GEMM config with 16x16 warp tile
|
||||
|
||||
struct MxGemmConfig
|
||||
{
|
||||
static constexpr ck_tile::index_t M_Tile = 128;
|
||||
static constexpr ck_tile::index_t N_Tile = 128;
|
||||
static constexpr ck_tile::index_t K_Tile = 512;
|
||||
|
||||
static constexpr ck_tile::index_t M_Warp = 1;
|
||||
static constexpr ck_tile::index_t N_Warp = 4;
|
||||
static constexpr ck_tile::index_t K_Warp = 1;
|
||||
|
||||
static constexpr ck_tile::index_t M_Warp_Tile = 16;
|
||||
static constexpr ck_tile::index_t N_Warp_Tile = 16;
|
||||
static constexpr ck_tile::index_t K_Warp_Tile = 128;
|
||||
|
||||
static constexpr bool kPadM = false;
|
||||
static constexpr bool kPadN = false;
|
||||
static constexpr bool kPadK = false;
|
||||
|
||||
static constexpr bool TransposeC = false;
|
||||
static constexpr bool UseStructuredSparsity = false;
|
||||
|
||||
static constexpr int kBlockPerCu = 1;
|
||||
static constexpr int TileParitionerGroupNum = 8;
|
||||
static constexpr int TileParitionerM01 = 4;
|
||||
static constexpr auto Scheduler = ck_tile::GemmPipelineScheduler::Intrawave;
|
||||
static constexpr ck_tile::index_t NumWaveGroups = 1;
|
||||
static constexpr bool DoubleSmemBuffer = false; // comp_async uses double buffer
|
||||
static constexpr bool Preshuffle = false;
|
||||
|
||||
static constexpr int N_Repeat = N_Tile / N_Warp_Tile / N_Warp;
|
||||
static constexpr bool TiledMMAPermuteN = false;
|
||||
};
|
||||
struct MXfp4_GemmConfig16 : MxGemmConfig
|
||||
{
|
||||
static constexpr ck_tile::index_t M_Tile = 64;
|
||||
static constexpr ck_tile::index_t N_Tile = 64;
|
||||
static constexpr ck_tile::index_t K_Tile = 256;
|
||||
};
|
||||
|
||||
// GEMM config with 16x16 warp tile
|
||||
struct MXfp8_GemmConfig16 : MxGemmConfig
|
||||
{
|
||||
static constexpr ck_tile::index_t M_Tile = 64;
|
||||
static constexpr ck_tile::index_t N_Tile = 64;
|
||||
static constexpr ck_tile::index_t K_Tile = 256;
|
||||
};
|
||||
106
example/ck_tile/42_mx_gemm/mx_gemm_instance.hpp
Normal file
106
example/ck_tile/42_mx_gemm/mx_gemm_instance.hpp
Normal file
@@ -0,0 +1,106 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/host.hpp"
|
||||
#include "mx_gemm.hpp"
|
||||
#include "ck_tile/ops/gemm_mx/pipeline/gemm_pipeline_ag_bg_cr_comp_async.hpp"
|
||||
#include "ck_tile/ops/gemm_mx/kernel/gemm_mx_kernel.hpp"
|
||||
|
||||
template <typename Layout>
|
||||
using is_row_major_t = ck_tile::bool_constant<
|
||||
std::is_same_v<ck_tile::remove_cvref_t<Layout>, ck_tile::tensor_layout::gemm::RowMajor>>;
|
||||
|
||||
template <typename GemmConfig,
|
||||
typename ADataType,
|
||||
typename BDataType,
|
||||
typename AccDataType,
|
||||
typename CDataType,
|
||||
typename ALayout,
|
||||
typename BLayout,
|
||||
typename CLayout,
|
||||
typename ScaleM,
|
||||
typename ScaleN,
|
||||
bool persistent,
|
||||
bool Splitk>
|
||||
float mx_gemm_calc(const MXGemmHostArgs<ScaleM, ScaleN>& args, const ck_tile::stream_config& s)
|
||||
{
|
||||
using GemmShape = ck_tile::TileGemmShape<
|
||||
ck_tile::sequence<GemmConfig::M_Tile, GemmConfig::N_Tile, GemmConfig::K_Tile>,
|
||||
ck_tile::sequence<GemmConfig::M_Warp, GemmConfig::N_Warp, GemmConfig::K_Warp>,
|
||||
ck_tile::
|
||||
sequence<GemmConfig::M_Warp_Tile, GemmConfig::N_Warp_Tile, GemmConfig::K_Warp_Tile>>;
|
||||
|
||||
using MXGemmTraits = ck_tile::TileGemmUniversalTraits<GemmConfig::kPadM,
|
||||
GemmConfig::kPadN,
|
||||
GemmConfig::kPadK,
|
||||
GemmConfig::DoubleSmemBuffer,
|
||||
ALayout,
|
||||
BLayout,
|
||||
CLayout,
|
||||
GemmConfig::TransposeC,
|
||||
GemmConfig::UseStructuredSparsity,
|
||||
persistent,
|
||||
GemmConfig::NumWaveGroups,
|
||||
GemmConfig::Preshuffle>;
|
||||
|
||||
using ComputeDataType = ADataType;
|
||||
static_assert(sizeof(ComputeDataType) >= sizeof(BDataType),
|
||||
"mixed_prec_gemm requires ADataType is a wider type than BDataType");
|
||||
|
||||
using MXPipelineProblem = ck_tile::UniversalGemmPipelineProblem<ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
GemmShape,
|
||||
MXGemmTraits,
|
||||
GemmConfig::Scheduler>;
|
||||
|
||||
// Use the new MX comp_async pipeline with MX scaling support
|
||||
using MXGemmPipeline = ck_tile::MXGemmPipelineAgBgCrCompAsync<MXPipelineProblem>;
|
||||
|
||||
using TilePartitioner =
|
||||
ck_tile::GemmSpatiallyLocalTilePartitioner<GemmShape,
|
||||
GemmConfig::TileParitionerGroupNum,
|
||||
GemmConfig::TileParitionerM01>;
|
||||
|
||||
using GemmEpilogue = ck_tile::CShuffleEpilogue<
|
||||
ck_tile::CShuffleEpilogueProblem<ComputeDataType,
|
||||
ComputeDataType,
|
||||
ck_tile::tuple<>, // DsDataType
|
||||
AccDataType,
|
||||
CDataType,
|
||||
ck_tile::tuple<>, // DsLayout
|
||||
CLayout,
|
||||
ck_tile::element_wise::PassThrough,
|
||||
TilePartitioner::MPerBlock,
|
||||
TilePartitioner::NPerBlock,
|
||||
GemmConfig::M_Warp,
|
||||
GemmConfig::N_Warp,
|
||||
GemmConfig::M_Warp_Tile,
|
||||
GemmConfig::N_Warp_Tile,
|
||||
GemmConfig::K_Warp_Tile,
|
||||
MXPipelineProblem::TransposeC>>;
|
||||
|
||||
using Kernel = ck_tile::MXGemmKernel<TilePartitioner, MXGemmPipeline, GemmEpilogue>;
|
||||
|
||||
auto kargs = Kernel::MakeKernelArgs(std::array<const void*, 1>{args.as_ptr},
|
||||
std::array<const void*, 1>{args.bs_ptr},
|
||||
std::array<const void*, 0>{},
|
||||
args.e_ptr,
|
||||
args.k_batch,
|
||||
args.M,
|
||||
args.N,
|
||||
args.K,
|
||||
std::array<ck_tile::index_t, 1>{args.stride_As},
|
||||
std::array<ck_tile::index_t, 1>{args.stride_Bs},
|
||||
std::array<ck_tile::index_t, 0>{},
|
||||
args.stride_E,
|
||||
args.scale_m,
|
||||
args.scale_n);
|
||||
|
||||
const auto kernel = ck_tile::make_kernel<Kernel::kBlockPerCu>(
|
||||
Kernel{}, Kernel::GridSize(kargs), Kernel::BlockSize(), 0, kargs);
|
||||
|
||||
return ck_tile::launch_kernel(s, kernel);
|
||||
}
|
||||
220
example/ck_tile/42_mx_gemm/run_mx_gemm.inc
Normal file
220
example/ck_tile/42_mx_gemm/run_mx_gemm.inc
Normal file
@@ -0,0 +1,220 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
// Calculate relative and absolute error thresholds for MX GEMM
|
||||
template <typename ADataType, typename BDataType, typename AccDataType, typename CDataType>
|
||||
auto calculate_rtol_atol(const ck_tile::index_t K, const float max_accumulated_value)
|
||||
{
|
||||
using ComputeType =
|
||||
std::conditional_t<sizeof(ADataType) < sizeof(BDataType), ADataType, BDataType>;
|
||||
// Calculate thresholds
|
||||
const auto rtol = ck_tile::get_relative_threshold<ComputeType, CDataType, AccDataType>(K);
|
||||
const auto atol = ck_tile::get_absolute_threshold<ComputeType, CDataType, AccDataType>(
|
||||
max_accumulated_value, K);
|
||||
return ck_tile::make_tuple(rtol, atol);
|
||||
}
|
||||
|
||||
// Use e8m0_t directly without packing - simpler and cleaner approach
|
||||
template <typename ADataType,
|
||||
typename BDataType,
|
||||
typename AccDataType,
|
||||
typename GemmConfig,
|
||||
bool UsePersistentKernel,
|
||||
typename ALayout,
|
||||
typename BLayout,
|
||||
typename CLayout>
|
||||
int run_mx_gemm_with_layouts(int argc, char* argv[], ALayout, BLayout, CLayout)
|
||||
{
|
||||
auto [result, arg_parser] = create_args(argc, argv);
|
||||
if(!result)
|
||||
return -1;
|
||||
|
||||
ck_tile::index_t M = arg_parser.get_int("m");
|
||||
ck_tile::index_t N = arg_parser.get_int("n");
|
||||
ck_tile::index_t K = arg_parser.get_int("k");
|
||||
ck_tile::index_t stride_A = arg_parser.get_int("stride_a");
|
||||
ck_tile::index_t stride_B = arg_parser.get_int("stride_b");
|
||||
ck_tile::index_t stride_C = arg_parser.get_int("stride_c");
|
||||
int validation = arg_parser.get_int("v");
|
||||
int n_warmup = arg_parser.get_int("warmup");
|
||||
int n_repeat = arg_parser.get_int("repeat");
|
||||
int kbatch = arg_parser.get_int("split_k");
|
||||
int init_method = arg_parser.get_int("init");
|
||||
|
||||
using CDataType = ck_tile::fp16_t;
|
||||
|
||||
// Use get_default_stride helper for automatic leading dimension calculation (only if not
|
||||
// explicitly provided)
|
||||
if(stride_A == 0)
|
||||
stride_A = ck_tile::get_default_stride(M, K, 0, is_row_major(ALayout{}));
|
||||
if(stride_B == 0)
|
||||
stride_B = ck_tile::get_default_stride(K, N, 0, is_row_major(BLayout{}));
|
||||
if(stride_C == 0)
|
||||
stride_C = ck_tile::get_default_stride(M, N, 0, is_row_major(CLayout{}));
|
||||
|
||||
ck_tile::HostTensor<ADataType> a_host(
|
||||
ck_tile::host_tensor_descriptor(M, K, stride_A, is_row_major(ALayout{})));
|
||||
ck_tile::HostTensor<BDataType> b_host(
|
||||
ck_tile::host_tensor_descriptor(K, N, stride_B, is_row_major(BLayout{})));
|
||||
ck_tile::HostTensor<CDataType> c_host(
|
||||
ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{})));
|
||||
|
||||
// Scale tensors - follow parent matrix layouts for optimal memory access
|
||||
// A scales: [M, K/32] with A's layout
|
||||
// B scales: [K/32, N] with B's layout
|
||||
using ScaleType = ck_tile::e8m0_t;
|
||||
ck_tile::index_t scale_k_size = K / 32;
|
||||
|
||||
// Follow A/BLayout to get the layouts for the scale tensors
|
||||
ck_tile::index_t stride_scale_a =
|
||||
ck_tile::get_default_stride(M, scale_k_size, 0, is_row_major(ALayout{}));
|
||||
ck_tile::index_t stride_scale_b =
|
||||
ck_tile::get_default_stride(scale_k_size, N, 0, is_row_major(BLayout{}));
|
||||
|
||||
ck_tile::HostTensor<ScaleType> scale_a_host(
|
||||
ck_tile::host_tensor_descriptor(M, scale_k_size, stride_scale_a, is_row_major(ALayout{})));
|
||||
ck_tile::HostTensor<ScaleType> scale_b_host(
|
||||
ck_tile::host_tensor_descriptor(scale_k_size, N, stride_scale_b, is_row_major(BLayout{})));
|
||||
int seed = 1234;
|
||||
switch(init_method)
|
||||
{
|
||||
case 0:
|
||||
// Initialize A, B, and scales to random values
|
||||
ck_tile::FillUniformDistribution<ADataType>{-2.f, 2.f, seed++}(a_host);
|
||||
ck_tile::FillUniformDistribution<BDataType>{-2.f, 2.f, seed++}(b_host);
|
||||
ck_tile::FillUniformDistribution<ScaleType>{0.001f, 10.f, seed++}(scale_a_host);
|
||||
ck_tile::FillUniformDistribution<ScaleType>{0.001f, 10.f, seed++}(scale_b_host);
|
||||
break;
|
||||
case 1:
|
||||
// Initialize A, B, and scales to 1.0
|
||||
ck_tile::FillConstant<ADataType>{ADataType(1.f)}(a_host);
|
||||
ck_tile::FillConstant<BDataType>{BDataType(1.f)}(b_host);
|
||||
ck_tile::FillConstant<ScaleType>{ScaleType(1.f)}(scale_a_host);
|
||||
ck_tile::FillConstant<ScaleType>{ScaleType(1.f)}(scale_b_host);
|
||||
break;
|
||||
case 2:
|
||||
// Initialize A and B with random values but with constant 1.0 scales
|
||||
ck_tile::FillUniformDistribution<ADataType>{-2.f, 2.f, seed++}(a_host);
|
||||
ck_tile::FillUniformDistribution<BDataType>{-2.f, 2.f, seed++}(b_host);
|
||||
ck_tile::FillConstant<ScaleType>{ScaleType(0.1f)}(scale_a_host);
|
||||
ck_tile::FillConstant<ScaleType>{ScaleType(0.1f)}(scale_b_host);
|
||||
break;
|
||||
}
|
||||
|
||||
// Device buffers for A, B, C, and scale tensors
|
||||
ck_tile::DeviceMem a_dev_buf(a_host.get_element_space_size_in_bytes());
|
||||
ck_tile::DeviceMem b_dev_buf(b_host.get_element_space_size_in_bytes());
|
||||
ck_tile::DeviceMem c_dev_buf(c_host.get_element_space_size_in_bytes());
|
||||
ck_tile::DeviceMem scale_a_dev_buf(scale_a_host.get_element_space_size_in_bytes());
|
||||
ck_tile::DeviceMem scale_b_dev_buf(scale_b_host.get_element_space_size_in_bytes());
|
||||
|
||||
a_dev_buf.ToDevice(a_host.data());
|
||||
b_dev_buf.ToDevice(b_host.data());
|
||||
c_dev_buf.SetZero(); // Initialize C buffer to zero
|
||||
scale_a_dev_buf.ToDevice(scale_a_host.data());
|
||||
scale_b_dev_buf.ToDevice(scale_b_host.data());
|
||||
|
||||
// Scale pointers - use e8m0_t* directly
|
||||
using ScaleM = ck_tile::MXScalePointer<ScaleType, 1, 32>; // in blocks of 32 in K
|
||||
using ScaleN = ck_tile::MXScalePointer<ScaleType, 1, 32>;
|
||||
ScaleM scale_m(reinterpret_cast<ScaleType*>(scale_a_dev_buf.GetDeviceBuffer()));
|
||||
ScaleN scale_n(reinterpret_cast<ScaleType*>(scale_b_dev_buf.GetDeviceBuffer()));
|
||||
|
||||
float ave_time = invoke_mx_gemm<GemmConfig,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CDataType,
|
||||
ALayout,
|
||||
BLayout,
|
||||
CLayout,
|
||||
ScaleM,
|
||||
ScaleN,
|
||||
UsePersistentKernel>(a_dev_buf,
|
||||
b_dev_buf,
|
||||
c_dev_buf,
|
||||
M,
|
||||
N,
|
||||
K,
|
||||
stride_A,
|
||||
stride_B,
|
||||
stride_C,
|
||||
kbatch,
|
||||
scale_m,
|
||||
scale_n,
|
||||
n_warmup,
|
||||
n_repeat);
|
||||
|
||||
(void)ave_time;
|
||||
|
||||
bool pass = true;
|
||||
if(validation > 0)
|
||||
{
|
||||
// get output data from device
|
||||
c_dev_buf.FromDevice(c_host.data());
|
||||
|
||||
// compute reference
|
||||
ck_tile::HostTensor<CDataType> c_m_n_host_ref(
|
||||
ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{})));
|
||||
c_m_n_host_ref.SetZero();
|
||||
|
||||
ck_tile::reference_mx_gemm<ADataType, BDataType, ScaleType, AccDataType, CDataType>(
|
||||
a_host, b_host, c_m_n_host_ref, scale_a_host, scale_b_host);
|
||||
|
||||
const float max_accumulated_value =
|
||||
*std::max_element(c_m_n_host_ref.mData.begin(), c_m_n_host_ref.mData.end());
|
||||
const auto rtol_atol = calculate_rtol_atol<ADataType, BDataType, AccDataType, CDataType>(
|
||||
K, max_accumulated_value);
|
||||
const double rtol = rtol_atol.at(ck_tile::number<0>{});
|
||||
const double atol = rtol_atol.at(ck_tile::number<1>{});
|
||||
pass = ck_tile::check_err(c_host, c_m_n_host_ref, "Error: Incorrect results!", rtol, atol);
|
||||
|
||||
std::cout << "Relative error threshold: " << rtol << " Absolute error threshold: " << atol
|
||||
<< std::endl;
|
||||
std::cout << "The CPU verification result is: " << (pass ? "correct" : "fail") << std::endl;
|
||||
}
|
||||
return pass ? 0 : -1;
|
||||
}
|
||||
|
||||
int run_mx_gemm_example(int argc, char* argv[])
|
||||
{
|
||||
auto [result, arg_parser] = create_args(argc, argv);
|
||||
if(!result)
|
||||
return -1;
|
||||
|
||||
using Row = ck_tile::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck_tile::tensor_layout::gemm::ColumnMajor;
|
||||
|
||||
std::string mx_prec = arg_parser.get_str("mx_prec");
|
||||
std::string a_layout = arg_parser.get_str("a_layout");
|
||||
std::string b_layout = arg_parser.get_str("b_layout");
|
||||
|
||||
if(a_layout == "R" && b_layout == "C")
|
||||
{
|
||||
if(mx_prec == "fp4" || mx_prec == "fp4xfp4")
|
||||
{
|
||||
return run_mx_gemm_with_layouts<ck_tile::pk_fp4_t,
|
||||
ck_tile::pk_fp4_t,
|
||||
float,
|
||||
MXfp4_GemmConfig16,
|
||||
true>(argc, argv, Row{}, Col{}, Row{});
|
||||
}
|
||||
else if(mx_prec == "fp8" || mx_prec == "fp8xfp8")
|
||||
{
|
||||
return run_mx_gemm_with_layouts<ck_tile::fp8_t,
|
||||
ck_tile::fp8_t,
|
||||
float,
|
||||
MXfp8_GemmConfig16,
|
||||
true>(argc, argv, Row{}, Col{}, Row{});
|
||||
}
|
||||
else
|
||||
{
|
||||
throw std::runtime_error("Only fp4/8 is supported currently!");
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
throw std::runtime_error("Only A=Row, B=Col layout is supported currently!");
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
@@ -30,6 +30,7 @@ add_subdirectory(36_pooling)
|
||||
add_subdirectory(38_block_scale_gemm)
|
||||
add_subdirectory(40_streamk_gemm)
|
||||
add_subdirectory(41_batched_contraction)
|
||||
add_subdirectory(42_mx_gemm)
|
||||
add_subdirectory(50_sparse_attn)
|
||||
add_subdirectory(51_tile_distr_enc_reg_map)
|
||||
|
||||
|
||||
@@ -287,6 +287,7 @@ struct tuple : impl::tuple_base<make_index_sequence<sizeof...(T)>, T...>
|
||||
template<index_t I> CK_TILE_HOST_DEVICE constexpr decltype(auto) operator[](number<I>) { TP_COM_(); return get<I>(); }
|
||||
template<index_t I> CK_TILE_HOST_DEVICE constexpr decltype(auto) operator[](number<I>) const { TP_COM_(); return get<I>(); }
|
||||
template<index_t I> CK_TILE_HOST_DEVICE constexpr decltype(auto) operator()(number<I>) { TP_COM_(); return get<I>(); } // TODO: compatible
|
||||
template<index_t I> CK_TILE_HOST_DEVICE constexpr decltype(auto) operator()(number<I>) const { TP_COM_(); return get<I>(); }
|
||||
|
||||
// below function should be used under tuple_array<> type, no extra check will perform here
|
||||
template <typename Tx> CK_TILE_HOST_DEVICE constexpr decltype(auto) get_as() { return reinterpret_cast<tuple_array<Tx, size()>&>(*this); }
|
||||
|
||||
@@ -78,6 +78,7 @@ struct static_distributed_tensor
|
||||
constexpr auto sliced_thread_tensor_desc =
|
||||
make_naive_tensor_descriptor_packed(make_tuple(YSliceLengths...));
|
||||
|
||||
// divide element number by PackedSize to get the correct thread buffer size
|
||||
thread_buffer<DataType, sliced_thread_tensor_desc.get_element_space_size() / PackedSize>
|
||||
sliced_thread_data;
|
||||
|
||||
|
||||
@@ -666,13 +666,13 @@ struct HostTensor
|
||||
if constexpr(std::is_same_v<T, bf16_t> || std::is_same_v<T, fp16_t> ||
|
||||
std::is_same_v<T, fp8_t> || std::is_same_v<T, bf8_t>)
|
||||
{
|
||||
os << type_convert<float>(mData[idx]) << " #### ";
|
||||
os << type_convert<float>(mData[idx]);
|
||||
}
|
||||
else if constexpr(std::is_same_v<T, ck_tile::pk_int4_t>)
|
||||
{
|
||||
auto unpacked = pk_int4_t_to_int8x2_t(mData[idx]);
|
||||
os << "pk(" << static_cast<int>(unpacked[0]) << ", "
|
||||
<< static_cast<int>(unpacked[1]) << ") #### ";
|
||||
<< static_cast<int>(unpacked[1]) << ")";
|
||||
}
|
||||
else if constexpr(std::is_same_v<T, int8_t>)
|
||||
{
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
#pragma once
|
||||
|
||||
#include <cstdlib>
|
||||
#include <mutex>
|
||||
#include <thread>
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
@@ -471,27 +472,42 @@ CK_TILE_HOST void reference_gemm(const HostTensor<ADataType>& a_m_k,
|
||||
{
|
||||
AccDataType v_a;
|
||||
AccDataType v_b;
|
||||
if constexpr(std::is_same_v<ADataType, pk_int4_t>)
|
||||
if constexpr(std::is_same_v<ADataType, pk_fp4_t>)
|
||||
{
|
||||
const pk_int4_t pk_val = a_element_op(a_m_k(m, k));
|
||||
// HostTensor automatically handles packed indexing: a_m_k(m,k) divides offset by
|
||||
// PackedSize So a_m_k(m,0) and a_m_k(m,1) return the same packed byte
|
||||
const pk_fp4_t pk_val = a_m_k(m, k);
|
||||
const fp32x2_t fp32_val = pk_val.to_fp32x2(1.0f);
|
||||
const float unpacked = (k % 2 == 1) ? fp32_val.hi : fp32_val.lo;
|
||||
v_a = ck_tile::type_convert<AccDataType>(a_element_op(unpacked));
|
||||
}
|
||||
else if constexpr(std::is_same_v<ADataType, pk_int4_t>)
|
||||
{
|
||||
// HostTensor automatically handles packed indexing
|
||||
const pk_int4_t pk_val = a_m_k(m, k);
|
||||
const fp32x2_t fp32_val = pk_int4_t_to_fp32x2_t(pk_val);
|
||||
if(k % 2 == 1)
|
||||
v_a = fp32_val.hi;
|
||||
else
|
||||
v_a = fp32_val.lo;
|
||||
const float unpacked = (k % 2 == 1) ? fp32_val.hi : fp32_val.lo;
|
||||
v_a = ck_tile::type_convert<AccDataType>(a_element_op(unpacked));
|
||||
}
|
||||
else
|
||||
{
|
||||
v_a = ck_tile::type_convert<AccDataType>(a_element_op(a_m_k(m, k)));
|
||||
}
|
||||
if constexpr(std::is_same_v<BDataType, pk_int4_t>)
|
||||
if constexpr(std::is_same_v<BDataType, pk_fp4_t>)
|
||||
{
|
||||
const pk_int4_t pk_val = b_element_op(b_k_n(k, n));
|
||||
// HostTensor automatically handles packed indexing
|
||||
const pk_fp4_t pk_val = b_k_n(k, n);
|
||||
const fp32x2_t fp32_val = pk_val.to_fp32x2(1.0f);
|
||||
const float unpacked = (k % 2 == 1) ? fp32_val.hi : fp32_val.lo;
|
||||
v_b = ck_tile::type_convert<AccDataType>(b_element_op(unpacked));
|
||||
}
|
||||
else if constexpr(std::is_same_v<BDataType, pk_int4_t>)
|
||||
{
|
||||
// HostTensor automatically handles packed indexing
|
||||
const pk_int4_t pk_val = b_k_n(k, n);
|
||||
const fp32x2_t fp32_val = pk_int4_t_to_fp32x2_t(pk_val);
|
||||
if(k % 2 == 1)
|
||||
v_b = fp32_val.hi;
|
||||
else
|
||||
v_b = fp32_val.lo;
|
||||
const float unpacked = (k % 2 == 1) ? fp32_val.hi : fp32_val.lo;
|
||||
v_b = ck_tile::type_convert<AccDataType>(b_element_op(unpacked));
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -671,7 +687,7 @@ CK_TILE_HOST void reference_mx_gemm(const HostTensor<ADataType>& a_m_k,
|
||||
b_k_n_scaled(k, n) = b_f4_lo * b_scale;
|
||||
b_k_n_scaled(k + 1, n) = b_f4_hi * b_scale;
|
||||
}
|
||||
else if constexpr(std::is_same_v<ADataType, pk_fp6x16_t>)
|
||||
else if constexpr(std::is_same_v<BDataType, pk_fp6x16_t>)
|
||||
{
|
||||
if(k % pk_fp6x16_t::packed_size != 0)
|
||||
continue;
|
||||
@@ -796,7 +812,7 @@ __global__ void naive_gemm_kernel(ADataType* A,
|
||||
}
|
||||
else if constexpr(std::is_same_v<ADataType, pk_fp4_t>)
|
||||
{
|
||||
const fp32x2_t fp32_val = pk_fp4_to_fp32x2(A[a_index / packed_size_a]);
|
||||
const fp32x2_t fp32_val = pk_fp4_to_fp32x2(A[a_index / packed_size_a], 1.0f);
|
||||
if(k % 2 == 1)
|
||||
v_a = fp32_val.hi;
|
||||
else
|
||||
@@ -816,7 +832,7 @@ __global__ void naive_gemm_kernel(ADataType* A,
|
||||
}
|
||||
else if constexpr(std::is_same_v<BDataType, pk_fp4_t>)
|
||||
{
|
||||
const fp32x2_t fp32_val = pk_fp4_to_fp32x2(B[b_index / packed_size_b]);
|
||||
const fp32x2_t fp32_val = pk_fp4_to_fp32x2(B[b_index / packed_size_b], 1.0f);
|
||||
if(k % 2 == 1)
|
||||
v_b = fp32_val.hi;
|
||||
else
|
||||
@@ -908,7 +924,7 @@ __global__ void blockwise_gemm_kernel(ADataType* A,
|
||||
}
|
||||
else if constexpr(std::is_same_v<ADataType, pk_fp4_t>)
|
||||
{
|
||||
const fp32x2_t fp32_val = pk_fp4_to_fp32x2(A[a_index / packed_size_a]);
|
||||
const fp32x2_t fp32_val = pk_fp4_to_fp32x2(A[a_index / packed_size_a], 1.0f);
|
||||
if(k % 2 == 1)
|
||||
v_a = fp32_val.hi;
|
||||
else
|
||||
|
||||
@@ -249,6 +249,113 @@ struct BlockGemmARegBRegCRegV1
|
||||
});
|
||||
}
|
||||
|
||||
// C += A * B with MX scaling
|
||||
// ScaleATensor: [MIterPerWarp, KIterPerWarp] -> int32_t
|
||||
// ScaleBTensor: [NIterPerWarp, KIterPerWarp] -> int32_t
|
||||
template <typename CBlockTensor,
|
||||
typename ABlockTensor,
|
||||
typename BBlockTensor,
|
||||
typename ScaleATensor,
|
||||
typename ScaleBTensor>
|
||||
CK_TILE_DEVICE void operator()(CBlockTensor& c_block_tensor,
|
||||
const ABlockTensor& a_block_tensor,
|
||||
const BBlockTensor& b_block_tensor,
|
||||
const ScaleATensor& scale_a_tensor,
|
||||
const ScaleBTensor& scale_b_tensor) const
|
||||
{
|
||||
static_assert(std::is_same_v<ADataType, remove_cv_t<typename ABlockTensor::DataType>> &&
|
||||
std::is_same_v<BDataType, remove_cv_t<typename BBlockTensor::DataType>> &&
|
||||
std::is_same_v<CDataType, remove_cv_t<typename CBlockTensor::DataType>>,
|
||||
"wrong!");
|
||||
|
||||
// check ABC-block-distribution
|
||||
static_assert(
|
||||
std::is_same_v<remove_cvref_t<decltype(MakeABlockDistributionEncode())>,
|
||||
remove_cvref_t<decltype(ABlockTensor::get_tile_distribution()
|
||||
.get_static_tile_distribution_encoding())>>,
|
||||
"A distribution is wrong!");
|
||||
static_assert(
|
||||
std::is_same_v<remove_cvref_t<decltype(MakeBBlockDistributionEncode())>,
|
||||
remove_cvref_t<decltype(BBlockTensor::get_tile_distribution()
|
||||
.get_static_tile_distribution_encoding())>>,
|
||||
"B distribution is wrong!");
|
||||
static_assert(
|
||||
std::is_same_v<remove_cvref_t<decltype(MakeCBlockDistributionEncode())>,
|
||||
remove_cvref_t<decltype(CBlockTensor::get_tile_distribution()
|
||||
.get_static_tile_distribution_encoding())>>,
|
||||
"C distribution is wrong!");
|
||||
|
||||
using AWarpDstr = typename WarpGemm::AWarpDstr;
|
||||
using BWarpDstr = typename WarpGemm::BWarpDstr;
|
||||
using CWarpDstr = typename WarpGemm::CWarpDstr;
|
||||
|
||||
using AWarpTensor = typename WarpGemm::AWarpTensor;
|
||||
using BWarpTensor = typename WarpGemm::BWarpTensor;
|
||||
using CWarpTensor = typename WarpGemm::CWarpTensor;
|
||||
|
||||
constexpr auto a_warp_y_lengths =
|
||||
to_sequence(AWarpDstr{}.get_ys_to_d_descriptor().get_lengths());
|
||||
constexpr auto b_warp_y_lengths =
|
||||
to_sequence(BWarpDstr{}.get_ys_to_d_descriptor().get_lengths());
|
||||
constexpr auto c_warp_y_lengths =
|
||||
to_sequence(CWarpDstr{}.get_ys_to_d_descriptor().get_lengths());
|
||||
|
||||
constexpr auto a_warp_y_index_zeros = uniform_sequence_gen_t<AWarpDstr::NDimY, 0>{};
|
||||
constexpr auto b_warp_y_index_zeros = uniform_sequence_gen_t<BWarpDstr::NDimY, 0>{};
|
||||
constexpr auto c_warp_y_index_zeros = uniform_sequence_gen_t<CWarpDstr::NDimY, 0>{};
|
||||
|
||||
// hot loop with MX scaling:
|
||||
static_for<0, KIterPerWarp, 1>{}([&](auto kIter) {
|
||||
static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
|
||||
// read A warp tensor from A Block window
|
||||
AWarpTensor a_warp_tensor;
|
||||
a_warp_tensor.get_thread_buffer() = a_block_tensor.get_y_sliced_thread_data(
|
||||
merge_sequences(sequence<mIter, kIter>{}, a_warp_y_index_zeros),
|
||||
merge_sequences(sequence<1, 1>{}, a_warp_y_lengths));
|
||||
|
||||
// get A scale for this M-K tile using get_y_sliced_thread_data
|
||||
auto scale_a_slice = scale_a_tensor.get_y_sliced_thread_data(
|
||||
sequence<kIter, mIter, 0>{}, sequence<1, 1, 1>{});
|
||||
const auto a_scale_e8m0 = scale_a_slice[number<0>{}];
|
||||
const int32_t a_scale = static_cast<int32_t>(a_scale_e8m0.get());
|
||||
|
||||
static_for<0, NIterPerWarp, 1>{}([&](auto nIter) {
|
||||
// read B warp tensor from B block tensor
|
||||
BWarpTensor b_warp_tensor;
|
||||
b_warp_tensor.get_thread_buffer() = b_block_tensor.get_y_sliced_thread_data(
|
||||
merge_sequences(sequence<nIter, kIter>{}, b_warp_y_index_zeros),
|
||||
merge_sequences(sequence<1, 1>{}, b_warp_y_lengths));
|
||||
|
||||
// get B scale for this N-K tile using get_y_sliced_thread_data
|
||||
auto scale_b_slice = scale_b_tensor.get_y_sliced_thread_data(
|
||||
sequence<kIter, nIter, 0>{}, sequence<1, 1, 1>{});
|
||||
const auto b_scale_e8m0 = scale_b_slice[number<0>{}];
|
||||
const int32_t b_scale = static_cast<int32_t>(b_scale_e8m0.get());
|
||||
|
||||
// read C warp tensor from C block tensor
|
||||
using c_iter_idx = std::
|
||||
conditional_t<TransposeC, sequence<nIter, mIter>, sequence<mIter, nIter>>;
|
||||
CWarpTensor c_warp_tensor;
|
||||
c_warp_tensor.get_thread_buffer() = c_block_tensor.get_y_sliced_thread_data(
|
||||
merge_sequences(c_iter_idx{}, c_warp_y_index_zeros),
|
||||
merge_sequences(sequence<1, 1>{}, c_warp_y_lengths));
|
||||
|
||||
// warp GEMM with MX scaling
|
||||
// Cast e8m0_t to int32_t, use OpSel=0 (least significant byte)
|
||||
constexpr index_t kOpSel = 0; // Always use OpSel=0
|
||||
WarpGemm{}.template operator()<kOpSel, kOpSel>(
|
||||
c_warp_tensor, a_warp_tensor, b_warp_tensor, a_scale, b_scale);
|
||||
|
||||
// write C warp tensor into C block tensor
|
||||
c_block_tensor.set_y_sliced_thread_data(
|
||||
merge_sequences(c_iter_idx{}, c_warp_y_index_zeros),
|
||||
merge_sequences(sequence<1, 1>{}, c_warp_y_lengths),
|
||||
c_warp_tensor.get_thread_buffer());
|
||||
});
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
CK_TILE_DEVICE static constexpr auto MakeCBlockTile()
|
||||
{
|
||||
using c_distr_ys_major = std::conditional_t<TransposeC, sequence<2, 1>, sequence<1, 2>>;
|
||||
|
||||
@@ -141,8 +141,11 @@ struct GemmPipelineAgBgCrImplBase
|
||||
auto a_lds_block = make_tensor_view<address_space_enum::lds>(p_a_lds, a_lds_block_desc);
|
||||
|
||||
// TODO: LDS alignment should come from Policy!
|
||||
constexpr index_t a_lds_block_space_size_aligned = integer_least_multiple(
|
||||
sizeof(OverrideADataType) * a_lds_block_desc.get_element_space_size(), 16);
|
||||
constexpr index_t APackedSize = numeric_traits<OverrideADataType>::PackedSize;
|
||||
constexpr index_t a_lds_block_space_size =
|
||||
sizeof(OverrideADataType) * a_lds_block_desc.get_element_space_size() / APackedSize;
|
||||
constexpr index_t a_lds_block_space_size_aligned =
|
||||
integer_least_multiple(a_lds_block_space_size, 16);
|
||||
|
||||
// B tile in LDS
|
||||
OverrideBDataType* __restrict__ p_b_lds = static_cast<OverrideBDataType*>(
|
||||
|
||||
@@ -89,6 +89,8 @@ struct BaseGemmPipelineAgBgCrCompAsync
|
||||
"Invalid TailNumber: Only TailNumber::Three and TailNumber::Two are supported");
|
||||
#endif
|
||||
}
|
||||
|
||||
// Returns the fixed name string of this pipeline.
CK_TILE_HOST static constexpr const char* GetName() { return "COMPUTE_ASYNC"; }
|
||||
};
|
||||
|
||||
/**
|
||||
|
||||
@@ -110,7 +110,7 @@ struct GemmPipelineProblemBase
|
||||
}
|
||||
else
|
||||
{
|
||||
return VectorLoadSize / sizeof(ADataType);
|
||||
return PackedSize * VectorLoadSize / sizeof(ADataType);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -536,14 +536,8 @@ struct UniversalGemmBasePolicy
|
||||
ck_tile::numeric_traits<remove_cvref_t<DataType>>::PackedSize;
|
||||
|
||||
// Assume DataType is even!
|
||||
if constexpr(XPerTile % (PackedSize * 32 / sizeof(DataType)) == 0 &&
|
||||
elements_per_thread % (PackedSize * 32 / sizeof(DataType)) == 0 &&
|
||||
PackedSize == 2)
|
||||
{
|
||||
return (PackedSize * 32 / sizeof(DataType));
|
||||
}
|
||||
else if constexpr(XPerTile % (PackedSize * 16 / sizeof(DataType)) == 0 &&
|
||||
elements_per_thread % (PackedSize * 16 / sizeof(DataType)) == 0)
|
||||
if constexpr(XPerTile % (PackedSize * 16 / sizeof(DataType)) == 0 &&
|
||||
elements_per_thread % (PackedSize * 16 / sizeof(DataType)) == 0)
|
||||
{
|
||||
return (PackedSize * 16 / sizeof(DataType));
|
||||
}
|
||||
@@ -861,30 +855,32 @@ struct UniversalGemmBasePolicy
|
||||
}
|
||||
|
||||
template <typename Problem>
|
||||
CK_TILE_DEVICE static constexpr index_t GetSmemSizeA()
|
||||
CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSizeA()
|
||||
{
|
||||
using ADataType = remove_cvref_t<typename Problem::ADataType>;
|
||||
constexpr auto APackedSize = numeric_traits<ADataType>::PackedSize;
|
||||
constexpr auto a_lds_block_desc = Derived::template MakeALdsBlockDescriptor<Problem>();
|
||||
constexpr index_t smem_size_a = integer_least_multiple(
|
||||
a_lds_block_desc.get_element_space_size() * sizeof(ADataType), 16);
|
||||
a_lds_block_desc.get_element_space_size() * sizeof(ADataType) / APackedSize, 16);
|
||||
return smem_size_a;
|
||||
}
|
||||
|
||||
template <typename Problem>
|
||||
CK_TILE_DEVICE static constexpr index_t GetSmemSizeB()
|
||||
CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSizeB()
|
||||
{
|
||||
constexpr bool IsBCastPolicyBeforeLDSWrite = IsBCastPolicyBeforeLDSWrite_v<Problem>;
|
||||
using BDataType = std::conditional_t<IsBCastPolicyBeforeLDSWrite,
|
||||
typename Problem::ADataType,
|
||||
typename Problem::BDataType>;
|
||||
constexpr auto BPackedSize = numeric_traits<BDataType>::PackedSize;
|
||||
constexpr auto b_lds_block_desc = Derived::template MakeBLdsBlockDescriptor<Problem>();
|
||||
constexpr index_t smem_size_b = integer_least_multiple(
|
||||
b_lds_block_desc.get_element_space_size() * sizeof(BDataType), 16);
|
||||
b_lds_block_desc.get_element_space_size() * sizeof(BDataType) / BPackedSize, 16);
|
||||
return smem_size_b;
|
||||
}
|
||||
|
||||
template <typename Problem>
|
||||
CK_TILE_DEVICE static constexpr index_t GetSmemSize()
|
||||
CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
|
||||
{
|
||||
constexpr index_t smem_size_a = GetSmemSizeA<Problem>();
|
||||
constexpr index_t smem_size_b = GetSmemSizeB<Problem>();
|
||||
|
||||
@@ -1599,6 +1599,9 @@ struct WarpGemmAttributeMfmaImpl_f32_16x16x128_f8f6f4
|
||||
static constexpr index_t kCM0PerLane = 1;
|
||||
static constexpr index_t kCM1PerLane = 4;
|
||||
|
||||
// Unity scale: kDefaultScale packs four exponent bytes of 0x7F (127); each byte decodes to 2^(127 - 127) = 1.0
|
||||
static constexpr index_t kDefaultScale = 0x7F7F7F7F;
|
||||
|
||||
// c_vec += a_vec * b_vec
|
||||
template <index_t opselA, index_t opselB, bool post_nop_ = false>
|
||||
CK_TILE_DEVICE void operator()(CVecType& c_vec,
|
||||
@@ -1669,13 +1672,13 @@ struct WarpGemmAttributeMfmaImpl_f32_16x16x128_f8f6f4
|
||||
const BVecType& b_vec,
|
||||
bool_constant<post_nop_> = {}) const
|
||||
{
|
||||
operator()<0, 0>(c_vec, a_vec, 0, b_vec, 0);
|
||||
operator()<0, 0>(c_vec, a_vec, kDefaultScale, b_vec, kDefaultScale);
|
||||
}
|
||||
|
||||
// c_vec = a_vec * b_vec
|
||||
CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const
|
||||
{
|
||||
return operator()<0, 0>(a_vec, 0, b_vec, 0);
|
||||
return operator()<0, 0>(a_vec, kDefaultScale, b_vec, kDefaultScale);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
9
include/ck_tile/ops/gemm_mx.hpp
Normal file
9
include/ck_tile/ops/gemm_mx.hpp
Normal file
@@ -0,0 +1,9 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/ops/gemm_mx/kernel/gemm_mx_kernel.hpp"
|
||||
#include "ck_tile/ops/gemm_mx/kernel/scale_pointer.hpp"
|
||||
#include "ck_tile/ops/gemm_mx/pipeline/gemm_pipeline_ag_bg_cr_comp_async.hpp"
|
||||
#include "ck_tile/ops/gemm_mx/pipeline/gemm_pipeline_ag_bg_cr_comp_async_default_policy.hpp"
|
||||
0
include/ck_tile/ops/gemm_mx/README.md
Normal file
0
include/ck_tile/ops/gemm_mx/README.md
Normal file
413
include/ck_tile/ops/gemm_mx/kernel/gemm_mx_kernel.hpp
Normal file
413
include/ck_tile/ops/gemm_mx/kernel/gemm_mx_kernel.hpp
Normal file
@@ -0,0 +1,413 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/ops/common.hpp"
|
||||
#include "ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp"
|
||||
#include "ck_tile/ops/gemm_mx/kernel/scale_pointer.hpp"
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
template <typename ScaleM = MXScalePointer<e8m0_t, -1>,
|
||||
typename ScaleN = MXScalePointer<e8m0_t, -1>,
|
||||
index_t NumATensor = 1,
|
||||
index_t NumBTensor = 1,
|
||||
index_t NumDTensor = 0>
|
||||
struct MXGemmKernelArgs : UniversalGemmKernelArgs<NumATensor, NumBTensor, NumDTensor>
|
||||
{
|
||||
using Base = UniversalGemmKernelArgs<NumATensor, NumBTensor, NumDTensor>;
|
||||
|
||||
CK_TILE_HOST MXGemmKernelArgs(const std::array<const void*, NumATensor>& as_ptr_,
|
||||
const std::array<const void*, NumBTensor>& bs_ptr_,
|
||||
const std::array<const void*, NumDTensor>& ds_ptr_,
|
||||
void* e_ptr_,
|
||||
index_t k_batch_,
|
||||
index_t M_,
|
||||
index_t N_,
|
||||
index_t K_,
|
||||
const std::array<index_t, NumATensor>& stride_As_,
|
||||
const std::array<index_t, NumBTensor>& stride_Bs_,
|
||||
const std::array<index_t, NumDTensor>& stride_Ds_,
|
||||
index_t stride_E_,
|
||||
ScaleM scale_m_ptr_,
|
||||
ScaleN scale_n_ptr_)
|
||||
: Base{as_ptr_,
|
||||
bs_ptr_,
|
||||
ds_ptr_,
|
||||
e_ptr_,
|
||||
M_,
|
||||
N_,
|
||||
K_,
|
||||
stride_As_,
|
||||
stride_Bs_,
|
||||
stride_Ds_,
|
||||
stride_E_,
|
||||
k_batch_},
|
||||
scale_m_ptr(scale_m_ptr_),
|
||||
scale_n_ptr(scale_n_ptr_)
|
||||
{
|
||||
}
|
||||
|
||||
ScaleM scale_m_ptr;
|
||||
ScaleN scale_n_ptr;
|
||||
};
|
||||
|
||||
template <typename TilePartitioner_, typename MXGemmPipeline_, typename EpiloguePipeline_>
|
||||
struct MXGemmKernel : UniversalGemmKernel<TilePartitioner_, MXGemmPipeline_, EpiloguePipeline_>
|
||||
{
|
||||
using Underlying = UniversalGemmKernel<TilePartitioner_, MXGemmPipeline_, EpiloguePipeline_>;
|
||||
|
||||
using TilePartitioner = remove_cvref_t<TilePartitioner_>;
|
||||
using MXGemmPipeline = remove_cvref_t<MXGemmPipeline_>;
|
||||
using BlockGemmShape = remove_cvref_t<typename MXGemmPipeline::BlockGemmShape>;
|
||||
using EpiloguePipeline = remove_cvref_t<EpiloguePipeline_>;
|
||||
using ALayout = remove_cvref_t<typename MXGemmPipeline::ALayout>;
|
||||
using BLayout = remove_cvref_t<typename MXGemmPipeline::BLayout>;
|
||||
using ELayout = remove_cvref_t<typename MXGemmPipeline::CLayout>;
|
||||
using DsLayout = remove_cvref_t<typename EpiloguePipeline::DsLayout>;
|
||||
using DsDataType = remove_cvref_t<typename EpiloguePipeline::DsDataType>;
|
||||
static constexpr index_t KernelBlockSize = MXGemmPipeline::BlockSize;
|
||||
static constexpr bool UsePersistentKernel = MXGemmPipeline::UsePersistentKernel;
|
||||
|
||||
// Below type is actually accumulation data type - the output of block GEMM.
|
||||
using EDataType = remove_cvref_t<typename EpiloguePipeline::ODataType>;
|
||||
|
||||
static constexpr auto I0 = number<0>();
|
||||
static constexpr auto I1 = number<1>();
|
||||
static constexpr auto I2 = number<2>();
|
||||
static constexpr auto I3 = number<3>();
|
||||
static constexpr auto I4 = number<4>();
|
||||
static constexpr auto I5 = number<5>();
|
||||
|
||||
static constexpr index_t NumATensor = Underlying::AsDataType::size();
|
||||
static constexpr index_t NumBTensor = Underlying::BsDataType::size();
|
||||
static constexpr index_t NumDTensor = Underlying::DsDataType::size();
|
||||
|
||||
using ADataType = remove_cvref_t<std::tuple_element_t<I0, typename Underlying::AsDataType>>;
|
||||
using BDataType = remove_cvref_t<std::tuple_element_t<I0, typename Underlying::BsDataType>>;
|
||||
|
||||
static constexpr auto MThreadPerXdl = BlockGemmShape::WarpTile::at(number<0>{});
|
||||
static constexpr auto NThreadPerXdl = BlockGemmShape::WarpTile::at(number<1>{});
|
||||
static constexpr auto KThreadPerXdl = 64 / MThreadPerXdl;
|
||||
|
||||
static constexpr auto APackedSize = numeric_traits<ADataType>::PackedSize;
|
||||
static constexpr auto BPackedSize = numeric_traits<BDataType>::PackedSize;
|
||||
|
||||
static constexpr int kBlockPerCu = 1;
|
||||
|
||||
static_assert(DsLayout::size() == DsDataType::size(),
|
||||
"The size of DsLayout and DsDataType should be the same");
|
||||
|
||||
[[nodiscard]] CK_TILE_HOST static const std::string GetName()
|
||||
{
|
||||
// clang-format off
|
||||
return concat('_', "mx_gemm", gemm_prec_str<ADataType, BDataType>, MXGemmPipeline::GetName());
|
||||
// clang-format on
|
||||
}
|
||||
|
||||
template <typename ScaleM, typename ScaleN>
|
||||
using KernelArgs = MXGemmKernelArgs<ScaleM, ScaleN, NumATensor, NumBTensor, NumDTensor>;
|
||||
|
||||
template <typename ScaleM, typename ScaleN>
|
||||
CK_TILE_HOST static auto MakeKernelArgs(const std::array<const void*, NumATensor>& as_ptr,
|
||||
const std::array<const void*, NumBTensor>& bs_ptr,
|
||||
const std::array<const void*, NumDTensor>& ds_ptr,
|
||||
void* e_ptr,
|
||||
index_t k_batch,
|
||||
index_t M,
|
||||
index_t N,
|
||||
index_t K,
|
||||
const std::array<index_t, NumATensor>& stride_As,
|
||||
const std::array<index_t, NumBTensor>& stride_Bs,
|
||||
const std::array<index_t, NumDTensor>& stride_Ds,
|
||||
index_t stride_E,
|
||||
ScaleM scale_m_ptr,
|
||||
ScaleN scale_n_ptr)
|
||||
{
|
||||
return KernelArgs<ScaleM, ScaleN>(as_ptr,
|
||||
bs_ptr,
|
||||
ds_ptr,
|
||||
e_ptr,
|
||||
k_batch,
|
||||
M,
|
||||
N,
|
||||
K,
|
||||
stride_As,
|
||||
stride_Bs,
|
||||
stride_Ds,
|
||||
stride_E,
|
||||
scale_m_ptr,
|
||||
scale_n_ptr);
|
||||
}
|
||||
|
||||
template <class ScaleM, class ScaleN>
|
||||
CK_TILE_HOST static constexpr auto GridSize(const KernelArgs<ScaleM, ScaleN>& kargs)
|
||||
{
|
||||
const int total_work_tile_cnt = TilePartitioner::GridSize(kargs.M, kargs.N);
|
||||
|
||||
if constexpr(UsePersistentKernel)
|
||||
{
|
||||
hipDeviceProp_t prop;
|
||||
int deviceId = 0; // default device
|
||||
|
||||
int dync_smem_size = 0;
|
||||
int maxActiveBlocksPerCU = 0;
|
||||
|
||||
if(hipGetDeviceProperties(&prop, deviceId) != hipSuccess)
|
||||
throw std::runtime_error(std::string("hipGetDeviceProperties failed: ") +
|
||||
hipGetErrorName(hipGetLastError()));
|
||||
|
||||
if(hipOccupancyMaxActiveBlocksPerMultiprocessor(
|
||||
&maxActiveBlocksPerCU,
|
||||
reinterpret_cast<void*>(
|
||||
kentry<1, MXGemmKernel, remove_cvref_t<decltype(kargs)>>),
|
||||
KernelBlockSize,
|
||||
dync_smem_size) != hipSuccess)
|
||||
throw std::runtime_error(
|
||||
std::string("hipOccupancyMaxActiveBlocksPerMultiprocessor failed: ") +
|
||||
hipGetErrorName(hipGetLastError()));
|
||||
|
||||
const int persistent_block_size = prop.multiProcessorCount * maxActiveBlocksPerCU;
|
||||
const int actual_grid_size = min(persistent_block_size, total_work_tile_cnt);
|
||||
|
||||
return dim3(actual_grid_size, 1, 1);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Non-persistent: use full grid size based on number of tiles
|
||||
return dim3(total_work_tile_cnt, 1, 1);
|
||||
}
|
||||
}
|
||||
|
||||
using SplitKBatchOffset = typename Underlying::SplitKBatchOffset;
|
||||
|
||||
// Create C block window following UniversalGemmKernel pattern
|
||||
template <memory_operation_enum DstInMemOp = memory_operation_enum::set,
|
||||
typename ScaleM,
|
||||
typename ScaleN>
|
||||
CK_TILE_DEVICE static auto MakeCBlockWindows(EDataType* e_ptr,
|
||||
const KernelArgs<ScaleM, ScaleN>& kargs,
|
||||
const index_t i_m,
|
||||
const index_t i_n)
|
||||
{
|
||||
// Create tensor view for E/C tensor
|
||||
constexpr index_t vector_size = EpiloguePipeline::GetVectorSizeC();
|
||||
const auto& e_tensor_view = [&]() -> auto {
|
||||
if constexpr(std::is_same_v<ELayout, tensor_layout::gemm::RowMajor>)
|
||||
{
|
||||
return make_naive_tensor_view<address_space_enum::global, DstInMemOp>(
|
||||
e_ptr,
|
||||
make_tuple(kargs.M, kargs.N),
|
||||
make_tuple(kargs.stride_E, 1),
|
||||
number<vector_size>{},
|
||||
number<1>{});
|
||||
}
|
||||
else
|
||||
{
|
||||
return make_naive_tensor_view<address_space_enum::global, DstInMemOp>(
|
||||
e_ptr,
|
||||
make_tuple(kargs.M, kargs.N),
|
||||
make_tuple(1, kargs.stride_E),
|
||||
number<1>{},
|
||||
number<vector_size>{});
|
||||
}
|
||||
}();
|
||||
|
||||
// Create padded view
|
||||
const auto& e_pad_view = [&]() {
|
||||
if constexpr(std::is_same_v<ELayout, tensor_layout::gemm::RowMajor>)
|
||||
{
|
||||
return pad_tensor_view(e_tensor_view,
|
||||
make_tuple(number<TilePartitioner::MPerBlock>{},
|
||||
number<TilePartitioner::NPerBlock>{}),
|
||||
sequence<false, false>{});
|
||||
}
|
||||
else
|
||||
{
|
||||
return pad_tensor_view(e_tensor_view,
|
||||
make_tuple(number<TilePartitioner::MPerBlock>{},
|
||||
number<TilePartitioner::NPerBlock>{}),
|
||||
sequence<false, false>{});
|
||||
}
|
||||
}();
|
||||
|
||||
// Create block window
|
||||
auto c_block_window = make_tile_window(
|
||||
e_pad_view,
|
||||
make_tuple(number<TilePartitioner::MPerBlock>{}, number<TilePartitioner::NPerBlock>{}),
|
||||
{i_m, i_n});
|
||||
|
||||
return c_block_window;
|
||||
}
|
||||
|
||||
// Create scale A block windows following the pattern of MakeABlockWindows
|
||||
template <typename ScaleM, typename ScaleN>
|
||||
CK_TILE_DEVICE static auto MakeScaleABlockWindows(const KernelArgs<ScaleM, ScaleN>& kargs,
|
||||
const index_t i_m)
|
||||
{
|
||||
auto scale_a = kargs.scale_m_ptr;
|
||||
|
||||
static constexpr int BlockScaleSize = ScaleM::GranularityK;
|
||||
const auto scale_k_size = kargs.K / BlockScaleSize;
|
||||
|
||||
// A scale tensor view - layout [M, scale_k_size] with e8m0_t elements
|
||||
// Use e8m0_t directly without packing
|
||||
const auto scale_a_tensor_view = make_naive_tensor_view<address_space_enum::global>(
|
||||
reinterpret_cast<const e8m0_t*>(scale_a.ptr),
|
||||
make_tuple(kargs.M, scale_k_size),
|
||||
make_tuple(scale_k_size, 1));
|
||||
|
||||
// Create block window for scale A
|
||||
// K dimension: scale_k_size e8m0_t elements
|
||||
// i_m is element offset (iM * MPerBlock), not tile index
|
||||
auto scale_a_block_window =
|
||||
make_tile_window(scale_a_tensor_view,
|
||||
make_tuple(number<TilePartitioner::MPerBlock>{},
|
||||
number<TilePartitioner::KPerBlock / BlockScaleSize>{}),
|
||||
{i_m, 0});
|
||||
|
||||
return scale_a_block_window;
|
||||
}
|
||||
|
||||
// Create scale B block windows following the pattern of MakeBBlockWindows
|
||||
template <typename ScaleM, typename ScaleN>
|
||||
CK_TILE_DEVICE static auto MakeScaleBBlockWindows(const KernelArgs<ScaleM, ScaleN>& kargs,
|
||||
const index_t i_n)
|
||||
{
|
||||
auto scale_b = kargs.scale_n_ptr;
|
||||
|
||||
static constexpr int BlockScaleSize = ScaleN::GranularityK;
|
||||
const auto scale_k_size = kargs.K / BlockScaleSize;
|
||||
|
||||
// B scale tensor view
|
||||
// Host stores as [K/32, N] col-major = [N, K/32] row-major from access perspective
|
||||
const auto scale_b_tensor_view = make_naive_tensor_view<address_space_enum::global>(
|
||||
reinterpret_cast<const e8m0_t*>(scale_b.ptr),
|
||||
make_tuple(kargs.N, scale_k_size), // [N, K/32] for access
|
||||
make_tuple(scale_k_size, 1)); // stride to match col-major storage
|
||||
|
||||
// Create block window for scale B
|
||||
// Tile window shape matches access pattern: [NPerBlock, KPerBlock/32]
|
||||
// i_n is element offset (iN * NPerBlock)
|
||||
auto scale_b_block_window =
|
||||
make_tile_window(scale_b_tensor_view,
|
||||
make_tuple(number<TilePartitioner::NPerBlock>{},
|
||||
number<TilePartitioner::KPerBlock / BlockScaleSize>{}),
|
||||
{i_n, 0});
|
||||
|
||||
return scale_b_block_window;
|
||||
}
|
||||
|
||||
template <class ScaleM, class ScaleN>
|
||||
CK_TILE_DEVICE static void RunMxGemm(const std::array<const ADataType*, NumATensor>& as_ptr,
|
||||
const std::array<const BDataType*, NumBTensor>& bs_ptr,
|
||||
const std::array<const void*, NumDTensor>& ds_ptr,
|
||||
EDataType* e_ptr,
|
||||
void* smem_ptr_ping,
|
||||
void* smem_ptr_pong,
|
||||
const KernelArgs<ScaleM, ScaleN>& kargs,
|
||||
const SplitKBatchOffset& splitk_batch_offset,
|
||||
const index_t i_m,
|
||||
const index_t i_n)
|
||||
{
|
||||
// Create block windows directly, following the new pattern from UniversalGemmKernel
|
||||
// i_m and i_n are element offsets (iM * MPerBlock, iN * NPerBlock), not tile indices
|
||||
const auto& a_block_window =
|
||||
Underlying::MakeABlockWindows(as_ptr, kargs, splitk_batch_offset.splitted_k, i_m);
|
||||
const auto& b_block_window =
|
||||
Underlying::MakeBBlockWindows(bs_ptr, kargs, splitk_batch_offset.splitted_k, i_n);
|
||||
const auto& d_block_window = Underlying::MakeDBlockWindows(ds_ptr, kargs, i_m, i_n);
|
||||
|
||||
// Create scale block windows using our new functions
|
||||
const auto& scale_a_block_window = MakeScaleABlockWindows(kargs, i_m);
|
||||
const auto& scale_b_block_window = MakeScaleBBlockWindows(kargs, i_n);
|
||||
|
||||
const index_t num_loop = TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k);
|
||||
|
||||
static_assert(ScaleM::GranularityK == ScaleN::GranularityK // have the same granK
|
||||
|| ScaleM::GranularityMN == -1 // or ScaleA is disable
|
||||
|| ScaleN::GranularityMN == -1, // or ScaleB is disable
|
||||
"ScaleM and ScaleN should have the same GranularityK");
|
||||
|
||||
const auto& c_block_tile = MXGemmPipeline{}(a_block_window[number<0>{}],
|
||||
b_block_window[number<0>{}],
|
||||
scale_a_block_window,
|
||||
scale_b_block_window,
|
||||
num_loop,
|
||||
smem_ptr_ping,
|
||||
smem_ptr_pong);
|
||||
|
||||
// Run Epilogue Pipeline - create C block window directly
|
||||
auto c_block_window = MakeCBlockWindows(e_ptr, kargs, i_m, i_n);
|
||||
EpiloguePipeline{}(c_block_window, c_block_tile, d_block_window, smem_ptr_ping);
|
||||
}
|
||||
|
||||
CK_TILE_HOST_DEVICE static constexpr index_t GetSmemPingSize()
|
||||
{
|
||||
return max(MXGemmPipeline::GetSmemSize(), EpiloguePipeline::GetSmemSize());
|
||||
}
|
||||
|
||||
CK_TILE_HOST_DEVICE static constexpr index_t GetSmemPongSize()
|
||||
{
|
||||
return MXGemmPipeline::GetSmemSize();
|
||||
}
|
||||
|
||||
template <class ScaleM, class ScaleN>
|
||||
CK_TILE_DEVICE void operator()(KernelArgs<ScaleM, ScaleN> kargs,
|
||||
int partition_idx = get_block_id()) const
|
||||
{
|
||||
const int total_work_tile_cnt =
|
||||
amd_wave_read_first_lane(TilePartitioner::GridSize(kargs.M, kargs.N));
|
||||
|
||||
// Allocate shared memory for ping pong buffers
|
||||
__shared__ char smem_ptr_ping[GetSmemPingSize()];
|
||||
__shared__ char smem_ptr_pong[GetSmemPongSize()];
|
||||
|
||||
// Support both persistent and non-persistent modes
|
||||
do
|
||||
{
|
||||
const auto [iM, iN] =
|
||||
TilePartitioner{kargs.M, kargs.N}.GetOutputTileIndex(partition_idx);
|
||||
const index_t i_m = amd_wave_read_first_lane(iM * TilePartitioner::MPerBlock);
|
||||
const index_t i_n = amd_wave_read_first_lane(iN * TilePartitioner::NPerBlock);
|
||||
|
||||
// Cast to base class for SplitKBatchOffset construction
|
||||
const SplitKBatchOffset splitk_batch_offset(
|
||||
static_cast<const typename Underlying::KernelArgs&>(kargs));
|
||||
// options
|
||||
EDataType* e_ptr = static_cast<EDataType*>(kargs.e_ptr);
|
||||
|
||||
// options
|
||||
std::array<const ADataType*, NumATensor> as_ptr;
|
||||
static_for<0, NumATensor, 1>{}([&](auto i) {
|
||||
as_ptr[i] = static_cast<const ADataType*>(kargs.as_ptr[i]) +
|
||||
splitk_batch_offset.as_k_split_offset[i] / APackedSize;
|
||||
});
|
||||
|
||||
std::array<const BDataType*, NumBTensor> bs_ptr;
|
||||
static_for<0, NumBTensor, 1>{}([&](auto i) {
|
||||
bs_ptr[i] = static_cast<const BDataType*>(kargs.bs_ptr[i]) +
|
||||
splitk_batch_offset.bs_k_split_offset[i] / BPackedSize;
|
||||
});
|
||||
|
||||
RunMxGemm<ScaleM, ScaleN>(as_ptr,
|
||||
bs_ptr,
|
||||
kargs.ds_ptr,
|
||||
e_ptr,
|
||||
smem_ptr_ping,
|
||||
smem_ptr_pong,
|
||||
kargs,
|
||||
splitk_batch_offset,
|
||||
i_m,
|
||||
i_n);
|
||||
partition_idx += gridDim.x;
|
||||
} while(UsePersistentKernel && partition_idx < total_work_tile_cnt);
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace ck_tile
|
||||
113
include/ck_tile/ops/gemm_mx/kernel/scale_pointer.hpp
Normal file
113
include/ck_tile/ops/gemm_mx/kernel/scale_pointer.hpp
Normal file
@@ -0,0 +1,113 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
template <typename ScaleType, int SharedGranularityMN, int SharedGranularityK = 0>
|
||||
struct MXScalePointer
|
||||
{
|
||||
static constexpr int GranularityMN = SharedGranularityMN;
|
||||
static constexpr int GranularityK = SharedGranularityK;
|
||||
|
||||
static_assert(GranularityK != 0,
|
||||
"GranularityK cannot be zero in primary template; "
|
||||
"use the partial specialization for GranularityK == 0");
|
||||
|
||||
const ScaleType* ptr;
|
||||
|
||||
CK_TILE_HOST_DEVICE MXScalePointer() = default;
|
||||
CK_TILE_HOST_DEVICE MXScalePointer(const ScaleType* ptr_) : ptr(ptr_) {}
|
||||
CK_TILE_HOST_DEVICE MXScalePointer(const ScaleType* ptr_, [[maybe_unused]] index_t length_)
|
||||
: ptr(ptr_)
|
||||
{
|
||||
}
|
||||
|
||||
CK_TILE_HOST_DEVICE MXScalePointer operator+(index_t offset) const
|
||||
{
|
||||
MXScalePointer ret;
|
||||
if constexpr(GranularityMN == 0)
|
||||
{
|
||||
ret.ptr = ptr + offset / GranularityK;
|
||||
}
|
||||
else
|
||||
{
|
||||
ret.ptr = ptr + offset / GranularityMN / GranularityK;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
CK_TILE_HOST_DEVICE ScaleType operator[](index_t i) const = delete;
|
||||
};
|
||||
|
||||
template <typename ScaleType, int SharedGranularityMN>
|
||||
struct MXScalePointer<ScaleType, SharedGranularityMN, 0>
|
||||
{
|
||||
static constexpr int GranularityMN = SharedGranularityMN;
|
||||
static constexpr int GranularityK = 0;
|
||||
|
||||
static_assert(GranularityMN != 0);
|
||||
|
||||
const ScaleType* ptr;
|
||||
index_t length;
|
||||
|
||||
CK_TILE_HOST_DEVICE MXScalePointer() = default;
|
||||
CK_TILE_HOST_DEVICE MXScalePointer(const ScaleType* ptr_) : ptr(ptr_), length(1) {}
|
||||
CK_TILE_HOST_DEVICE MXScalePointer(const ScaleType* ptr_, index_t length_)
|
||||
: ptr(ptr_), length(length_)
|
||||
{
|
||||
}
|
||||
|
||||
CK_TILE_HOST_DEVICE MXScalePointer operator+(index_t offset) const
|
||||
{
|
||||
MXScalePointer ret;
|
||||
if constexpr(GranularityMN == 1)
|
||||
{
|
||||
ret.ptr = ptr + offset;
|
||||
ret.length = length - offset;
|
||||
}
|
||||
else
|
||||
{
|
||||
ret.ptr = ptr + offset / GranularityMN;
|
||||
ret.length = length - offset / GranularityMN;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
CK_TILE_HOST_DEVICE ScaleType operator[](index_t i) const
|
||||
{
|
||||
// with additional oob check
|
||||
if constexpr(GranularityMN == 1)
|
||||
return i < length ? ptr[i] : 0;
|
||||
else
|
||||
return i / GranularityMN < length ? ptr[i / GranularityMN] : 0;
|
||||
}
|
||||
};
|
||||
|
||||
// shared granularityMN = -1 means no scale
|
||||
template <typename ScaleType>
|
||||
struct MXScalePointer<ScaleType, -1, 0>
|
||||
{
|
||||
static constexpr int GranularityMN = -1;
|
||||
static constexpr int GranularityK = 0;
|
||||
|
||||
const ScaleType* ptr = nullptr;
|
||||
|
||||
CK_TILE_HOST_DEVICE constexpr MXScalePointer() = default;
|
||||
CK_TILE_HOST_DEVICE constexpr MXScalePointer(const ScaleType*) {}
|
||||
CK_TILE_HOST_DEVICE constexpr MXScalePointer(const ScaleType*, index_t) {}
|
||||
|
||||
CK_TILE_HOST_DEVICE constexpr MXScalePointer operator+(index_t) const
|
||||
{
|
||||
return MXScalePointer{};
|
||||
}
|
||||
CK_TILE_HOST_DEVICE constexpr ScaleType operator[](index_t) const
|
||||
{
|
||||
return 1; // alway return 1, it doesn't change the result
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace ck_tile
|
||||
@@ -0,0 +1,723 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/core/arch/arch.hpp"
|
||||
#include "ck_tile/core/tensor/load_tile.hpp"
|
||||
#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp"
|
||||
#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp"
|
||||
#include "ck_tile/ops/gemm_mx/pipeline/gemm_pipeline_ag_bg_cr_comp_async_default_policy.hpp"
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
// A Tile Window: global memory
|
||||
// B Tile Window: global memory
|
||||
// C Distributed tensor: register
|
||||
// MX scaling support with OpSel
|
||||
template <typename Problem>
|
||||
struct BaseMXGemmPipelineAgBgCrCompAsync
|
||||
{
|
||||
static constexpr index_t PrefetchStages = 2;
|
||||
static constexpr index_t PrefillStages = 1;
|
||||
static constexpr index_t GlobalBufferNum = 1;
|
||||
|
||||
static constexpr bool UsePersistentKernel = Problem::Traits::UsePersistentKernel;
|
||||
|
||||
CK_TILE_HOST_DEVICE static constexpr bool BlockHasHotloop(index_t num_loop)
|
||||
{
|
||||
return num_loop > PrefetchStages;
|
||||
}
|
||||
|
||||
CK_TILE_HOST_DEVICE static constexpr TailNumber GetBlockLoopTailNum(index_t num_loop)
|
||||
{
|
||||
if(num_loop == 1)
|
||||
{
|
||||
return TailNumber::One;
|
||||
}
|
||||
if(num_loop % PrefetchStages == 1)
|
||||
{
|
||||
return TailNumber::Three;
|
||||
}
|
||||
else
|
||||
{
|
||||
return TailNumber::Two;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename RunFunction>
|
||||
CK_TILE_HOST_DEVICE static auto
|
||||
TailHandler(const RunFunction& run_func, bool has_hot_loop, TailNumber tail_number)
|
||||
{
|
||||
// Handle all the valid cases.
|
||||
if(has_hot_loop)
|
||||
{
|
||||
if(tail_number == TailNumber::Three)
|
||||
{
|
||||
return run_func(bool_constant<true>{},
|
||||
integral_constant<TailNumber, TailNumber::Three>{});
|
||||
}
|
||||
else if(tail_number == TailNumber::Two)
|
||||
{
|
||||
return run_func(bool_constant<true>{},
|
||||
integral_constant<TailNumber, TailNumber::Two>{});
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if(tail_number == TailNumber::Three)
|
||||
{
|
||||
return run_func(bool_constant<false>{},
|
||||
integral_constant<TailNumber, TailNumber::Three>{});
|
||||
}
|
||||
else if(tail_number == TailNumber::Two)
|
||||
{
|
||||
return run_func(bool_constant<false>{},
|
||||
integral_constant<TailNumber, TailNumber::Two>{});
|
||||
}
|
||||
else
|
||||
{
|
||||
return (run_func(bool_constant<false>{},
|
||||
integral_constant<TailNumber, TailNumber::One>{}));
|
||||
}
|
||||
}
|
||||
// If execution reaches here, it's an invalid tail_number because it wasn't handled above.
|
||||
#if defined(__HIP_DEVICE_COMPILE__)
|
||||
__builtin_unreachable();
|
||||
#else
|
||||
throw std::logic_error(
|
||||
"Invalid TailNumber: Only TailNumber::Three and TailNumber::Two are supported");
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* @brief MX GEMM compute-optimized asynchronous pipeline, based on V4.
|
||||
*
|
||||
* This pipeline introduces asynchronous load from global memory to LDS,
|
||||
* skipping the intermediate loading into pipeline registers.
|
||||
* Supports MX scaling with e8m0 packed values and OpSel.
|
||||
*/
|
||||
template <typename Problem, typename Policy = MXGemmPipelineAgBgCrCompAsyncDefaultPolicy>
|
||||
struct MXGemmPipelineAgBgCrCompAsync : public BaseMXGemmPipelineAgBgCrCompAsync<Problem>
|
||||
{
|
||||
using Base = BaseMXGemmPipelineAgBgCrCompAsync<Problem>;
|
||||
using PipelineImplBase = GemmPipelineAgBgCrImplBase<Problem, Policy>;
|
||||
|
||||
using AsDataType = remove_cvref_t<typename Problem::AsDataTypeTuple>;
|
||||
using BsDataType = remove_cvref_t<typename Problem::BsDataTypeTuple>;
|
||||
using CDataType = remove_cvref_t<typename Problem::CDataType>;
|
||||
using BlockGemmShape = remove_cvref_t<typename Problem::BlockGemmShape>;
|
||||
|
||||
using AsLayout = remove_cvref_t<typename Problem::AsLayoutTuple>;
|
||||
using BsLayout = remove_cvref_t<typename Problem::BsLayoutTuple>;
|
||||
using CLayout = remove_cvref_t<typename Problem::CLayout>;
|
||||
|
||||
using AElementWise = remove_cvref_t<typename Problem::AElementWise>;
|
||||
using BElementWise = remove_cvref_t<typename Problem::BElementWise>;
|
||||
|
||||
using ALayout = remove_cvref_t<std::tuple_element_t<0, AsLayout>>;
|
||||
using BLayout = remove_cvref_t<std::tuple_element_t<0, BsLayout>>;
|
||||
|
||||
using ADataType = remove_cvref_t<std::tuple_element_t<0, AsDataType>>;
|
||||
using BDataType = remove_cvref_t<std::tuple_element_t<0, BsDataType>>;
|
||||
|
||||
static_assert(!std::is_same_v<BDataType, pk_int4_t>, "Not implemented");
|
||||
|
||||
// Each scale covers 32 K elements
|
||||
static constexpr index_t ScaleBlockSize = 32;
|
||||
|
||||
static constexpr index_t APackedSize =
|
||||
ck_tile::numeric_traits<remove_cvref_t<ADataType>>::PackedSize;
|
||||
static constexpr index_t BPackedSize =
|
||||
ck_tile::numeric_traits<remove_cvref_t<BDataType>>::PackedSize;
|
||||
|
||||
using BlockGemm = remove_cvref_t<decltype(Policy::template GetBlockGemm<Problem>())>;
|
||||
using I0 = number<0>;
|
||||
using I1 = number<1>;
|
||||
using I2 = number<2>;
|
||||
|
||||
static constexpr index_t BlockSize = Problem::kBlockSize;
|
||||
|
||||
static constexpr index_t MPerBlock = BlockGemmShape::kM;
|
||||
static constexpr index_t NPerBlock = BlockGemmShape::kN;
|
||||
static constexpr index_t KPerBlock = BlockGemmShape::kK;
|
||||
|
||||
template <bool IsWave32Host = false>
|
||||
static constexpr index_t GetVectorSizeA()
|
||||
{
|
||||
return Policy::template GetVectorSizeA<Problem, IsWave32Host>();
|
||||
}
|
||||
template <bool IsWave32Host = false>
|
||||
static constexpr index_t GetVectorSizeB()
|
||||
{
|
||||
return Policy::template GetVectorSizeB<Problem, IsWave32Host>();
|
||||
}
|
||||
static constexpr index_t GetVectorSizeC() { return Policy::template GetVectorSizeC<Problem>(); }
|
||||
|
||||
static constexpr index_t GetSmemPackA() { return Policy::template GetSmemPackA<Problem>(); }
|
||||
static constexpr index_t GetSmemPackB() { return Policy::template GetSmemPackB<Problem>(); }
|
||||
|
||||
static constexpr index_t NumWaveGroups = Problem::NumWaveGroups;
|
||||
static constexpr index_t Preshuffle = Problem::Preshuffle;
|
||||
|
||||
static constexpr bool kPadM = Problem::kPadM;
|
||||
static constexpr bool kPadN = Problem::kPadN;
|
||||
static constexpr bool kPadK = Problem::kPadK;
|
||||
|
||||
static constexpr bool DoubleSmemBuffer = Problem::DoubleSmemBuffer;
|
||||
|
||||
static constexpr auto Scheduler = Problem::Scheduler;
|
||||
|
||||
static constexpr auto is_a_load_tr_v = bool_constant<PipelineImplBase::is_a_load_tr>{};
|
||||
static constexpr auto is_b_load_tr_v = bool_constant<PipelineImplBase::is_b_load_tr>{};
|
||||
|
||||
[[nodiscard]] CK_TILE_HOST static const std::string GetPipelineName()
|
||||
{
|
||||
// clang-format off
|
||||
return "COMPUTE_ASYNC";
|
||||
// clang-format on
|
||||
}
|
||||
|
||||
CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
|
||||
{
|
||||
return Policy::template GetSmemSize<Problem>();
|
||||
}
|
||||
|
||||
CK_TILE_HOST_DEVICE static constexpr auto IsTransposeC()
|
||||
{
|
||||
return Policy::template IsTransposeC<Problem>();
|
||||
}
|
||||
|
||||
template <GemmPipelineScheduler Scheduler>
|
||||
struct PipelineImpl : public PipelineImplBase
|
||||
{
|
||||
};
|
||||
|
||||
template <>
|
||||
struct PipelineImpl<GemmPipelineScheduler::Intrawave> : public PipelineImplBase
|
||||
{
|
||||
using Base = PipelineImplBase;
|
||||
|
||||
/// Emit scheduling hints for one hot-loop half-iteration: one group of
/// sched_group_barrier calls per buffer-load instruction, closed by a full
/// scheduling barrier.
CK_TILE_DEVICE static constexpr auto HotLoopScheduler()
{
    // Warp-tile extents
    constexpr index_t warp_m = BlockGemmShape::WarpTile::at(I0{});
    constexpr index_t warp_n = BlockGemmShape::WarpTile::at(I1{});
    constexpr index_t warp_k = BlockGemmShape::WarpTile::at(I2{});

    constexpr index_t wave_size = get_warp_size();

    // Buffer-load instruction counts derived from block tile size and vector width
    constexpr index_t a_load_insts = MPerBlock * KPerBlock / (BlockSize * GetVectorSizeA());
    constexpr index_t b_load_insts = NPerBlock * KPerBlock / (BlockSize * GetVectorSizeB());
    constexpr auto load_insts      = a_load_insts + b_load_insts;

    // MFMA instruction count derived from block tile and warp tile volumes
    constexpr index_t mfma_insts = MPerBlock * NPerBlock * KPerBlock /
                                   (BlockSize / wave_size) / (warp_m * warp_n * warp_k);

    static_for<0, load_insts, 1>{}([&](auto) {
        // TODO: this will likely need to be redesigned after (1) changes to reading from
        // LDS and (2) re-profiling
        __builtin_amdgcn_sched_group_barrier(LLVMSchedGroupMask::MFMA, 1, 0);      // MFMA: 1
        __builtin_amdgcn_sched_group_barrier(LLVMSchedGroupMask::DS_READ, 1, 0);   // DS read: 1
        __builtin_amdgcn_sched_group_barrier(LLVMSchedGroupMask::MFMA, 1, 0);      // MFMA: 1
        __builtin_amdgcn_sched_group_barrier(LLVMSchedGroupMask::VMEM_READ, 1, 0); // VMEM read: 1
        // remaining MFMAs of this group (two were placed above)
        __builtin_amdgcn_sched_group_barrier(
            LLVMSchedGroupMask::MFMA, mfma_insts / load_insts - 2, 0);
    });
    __builtin_amdgcn_sched_barrier(0);
}
|
||||
|
||||
template <bool HasHotLoop,
|
||||
TailNumber TailNum,
|
||||
typename AsDramBlockWindowTmp,
|
||||
typename BsDramBlockWindowTmp,
|
||||
typename ScaleADramBlockWindowTmp,
|
||||
typename ScaleBDramBlockWindowTmp,
|
||||
typename AElementFunction,
|
||||
typename BElementFunction,
|
||||
typename std::enable_if_t<is_detected<is_tuple, AsDramBlockWindowTmp>::value &&
|
||||
is_detected<is_tuple, BsDramBlockWindowTmp>::value,
|
||||
bool>* = nullptr>
|
||||
CK_TILE_DEVICE auto operator()(const AsDramBlockWindowTmp& a_dram_block_window_tmp,
|
||||
const AElementFunction& a_element_func,
|
||||
const BsDramBlockWindowTmp& b_dram_block_window_tmp,
|
||||
const BElementFunction& b_element_func,
|
||||
const ScaleADramBlockWindowTmp& scale_a_window,
|
||||
const ScaleBDramBlockWindowTmp& scale_b_window,
|
||||
index_t num_loop,
|
||||
void* __restrict__ p_smem_0,
|
||||
void* __restrict__ p_smem_1) const
|
||||
{
|
||||
// TODO support multi-ABD
|
||||
static_assert(1 == std::tuple_size_v<AsDramBlockWindowTmp>);
|
||||
static_assert(1 == std::tuple_size_v<BsDramBlockWindowTmp>);
|
||||
using ADramBlockWindowTmp =
|
||||
remove_cvref_t<std::tuple_element_t<number<0>{}, AsDramBlockWindowTmp>>;
|
||||
using BDramBlockWindowTmp =
|
||||
remove_cvref_t<std::tuple_element_t<number<0>{}, BsDramBlockWindowTmp>>;
|
||||
// TODO currently fused elementwise are not supported
|
||||
ignore = a_element_func;
|
||||
ignore = b_element_func;
|
||||
static_assert(std::is_same_v<remove_cvref_t<decltype(a_element_func)>,
|
||||
element_wise::PassThrough>);
|
||||
static_assert(std::is_same_v<remove_cvref_t<decltype(b_element_func)>,
|
||||
element_wise::PassThrough>);
|
||||
static_assert(
|
||||
std::is_same_v<ADataType, remove_cvref_t<typename ADramBlockWindowTmp::DataType>> &&
|
||||
std::is_same_v<BDataType,
|
||||
remove_cvref_t<typename BDramBlockWindowTmp::DataType>>,
|
||||
"Data Type conflict on A and B matrix input data type.");
|
||||
|
||||
constexpr bool is_a_col_major =
|
||||
std::is_same_v<ALayout, tensor_layout::gemm::ColumnMajor>;
|
||||
constexpr bool is_b_row_major = std::is_same_v<BLayout, tensor_layout::gemm::RowMajor>;
|
||||
|
||||
static_assert(is_a_col_major
|
||||
? (KPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[I0{}] &&
|
||||
MPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[I1{}])
|
||||
: (MPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[I0{}] &&
|
||||
KPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[I1{}]),
|
||||
"A block window has incorrect lengths for defined ALayout!");
|
||||
static_assert(is_b_row_major
|
||||
? (KPerBlock == BDramBlockWindowTmp{}.get_window_lengths()[I0{}] &&
|
||||
NPerBlock == BDramBlockWindowTmp{}.get_window_lengths()[I1{}])
|
||||
: (NPerBlock == BDramBlockWindowTmp{}.get_window_lengths()[I0{}] &&
|
||||
KPerBlock == BDramBlockWindowTmp{}.get_window_lengths()[I1{}]),
|
||||
"B block window has incorrect lengths for defined BLayout!");
|
||||
|
||||
////////////// global window & register /////////////////
|
||||
// A DRAM tile window(s) for load
|
||||
auto a_tile_windows = generate_tuple(
|
||||
[&](auto idx) {
|
||||
return make_tile_window(
|
||||
a_dram_block_window_tmp[number<idx>{}].get_bottom_tensor_view(),
|
||||
make_tuple(number<MPerBlock>{}, number<KPerBlock>{}),
|
||||
a_dram_block_window_tmp[number<idx>{}].get_window_origin(),
|
||||
Policy::template MakeADramTileDistribution<Problem>());
|
||||
},
|
||||
number<AsLayout::size()>{});
|
||||
// B DRAM window(s) for load
|
||||
auto b_tile_windows = generate_tuple(
|
||||
[&](auto idx) {
|
||||
return make_tile_window(
|
||||
b_dram_block_window_tmp[number<idx>{}].get_bottom_tensor_view(),
|
||||
make_tuple(number<NPerBlock>{}, number<KPerBlock>{}),
|
||||
b_dram_block_window_tmp[number<idx>{}].get_window_origin(),
|
||||
Policy::template MakeBDramTileDistribution<Problem>());
|
||||
},
|
||||
number<BsLayout::size()>{});
|
||||
|
||||
////////////// MX Scale windows /////////////////
|
||||
// Get WarpGemm configuration
|
||||
using BlockWarps = typename BlockGemmShape::BlockWarps;
|
||||
constexpr index_t MWarp = BlockWarps::at(I0{});
|
||||
constexpr index_t NWarp = BlockWarps::at(I1{});
|
||||
|
||||
// Calculate scale dimensions: KPerBlock elements need KPerBlock/32 e8m0_t scales
|
||||
constexpr index_t ScaleKDimPerBlock = KPerBlock / ScaleBlockSize;
|
||||
|
||||
// Scale tensor views and base origins for creating tile windows per iteration
|
||||
const auto& scale_a_tensor_view = scale_a_window.get_bottom_tensor_view();
|
||||
const auto& scale_b_tensor_view = scale_b_window.get_bottom_tensor_view();
|
||||
auto scale_a_base_origin = scale_a_window.get_window_origin();
|
||||
auto scale_b_base_origin = scale_b_window.get_window_origin();
|
||||
|
||||
// Create sample scale windows to determine tile types
|
||||
auto scale_a_dram_window =
|
||||
make_tile_window(scale_a_tensor_view,
|
||||
make_tuple(number<MPerBlock>{}, number<ScaleKDimPerBlock>{}),
|
||||
scale_a_base_origin,
|
||||
Policy::template MakeMX_ScaleA_DramTileDistribution<Problem>());
|
||||
|
||||
auto scale_b_dram_window =
|
||||
make_tile_window(scale_b_tensor_view,
|
||||
make_tuple(number<NPerBlock>{}, number<ScaleKDimPerBlock>{}),
|
||||
scale_b_base_origin,
|
||||
Policy::template MakeMX_ScaleB_DramTileDistribution<Problem>());
|
||||
|
||||
// this pipeline has a pair of LDS buffers per logical tile
|
||||
auto&& [a_lds_block0, b_lds_block0] = Base::GetABLdsTensorViews(p_smem_0);
|
||||
auto&& [a_lds_block1, b_lds_block1] = Base::GetABLdsTensorViews(p_smem_1);
|
||||
|
||||
constexpr auto a_lds_shape = []() {
|
||||
if constexpr(is_a_load_tr_v)
|
||||
return make_tuple(number<KPerBlock>{}, number<MPerBlock>{});
|
||||
else
|
||||
return make_tuple(number<MPerBlock>{}, number<KPerBlock>{});
|
||||
}();
|
||||
|
||||
constexpr auto b_lds_shape = []() {
|
||||
if constexpr(is_b_load_tr_v)
|
||||
return make_tuple(number<KPerBlock>{}, number<NPerBlock>{});
|
||||
else
|
||||
return make_tuple(number<NPerBlock>{}, number<KPerBlock>{});
|
||||
}();
|
||||
|
||||
// LDS tile windows for storing, one per LDS buffer
|
||||
auto a_copy_lds_window0 = make_tile_window(a_lds_block0, a_lds_shape, {0, 0});
|
||||
|
||||
auto a_copy_lds_window1 = make_tile_window(a_lds_block1, a_lds_shape, {0, 0});
|
||||
|
||||
auto b_copy_lds_window0 = make_tile_window(b_lds_block0, b_lds_shape, {0, 0});
|
||||
|
||||
auto b_copy_lds_window1 = make_tile_window(b_lds_block1, b_lds_shape, {0, 0});
|
||||
|
||||
// initialize DRAM window steps, used to advance the DRAM windows
|
||||
using ADramTileWindowStep = typename ADramBlockWindowTmp::BottomTensorIndex;
|
||||
using BDramTileWindowStep = typename BDramBlockWindowTmp::BottomTensorIndex;
|
||||
constexpr ADramTileWindowStep a_dram_tile_window_step =
|
||||
is_a_col_major ? make_array(KPerBlock, 0) : make_array(0, KPerBlock);
|
||||
constexpr BDramTileWindowStep b_dram_tile_window_step =
|
||||
is_b_row_major ? make_array(KPerBlock, 0) : make_array(0, KPerBlock);
|
||||
|
||||
// read A(0), B(0) from DRAM to LDS window(0)
|
||||
// and advance the DRAM windows
|
||||
Base::GlobalPrefetchAsync(
|
||||
a_copy_lds_window0, a_tile_windows[number<0>{}], a_dram_tile_window_step);
|
||||
Base::GlobalPrefetchAsync(
|
||||
b_copy_lds_window0, b_tile_windows[number<0>{}], b_dram_tile_window_step);
|
||||
|
||||
// Initialize block gemm and C block tile
|
||||
auto block_gemm = BlockGemm();
|
||||
auto c_block_tile = block_gemm.MakeCBlockTile();
|
||||
clear_tile(c_block_tile);
|
||||
|
||||
// read A(1), B(1) from DRAM to LDS window(1)
|
||||
// and advance the DRAM windows
|
||||
Base::GlobalPrefetchAsync(
|
||||
a_copy_lds_window1, a_tile_windows[number<0>{}], a_dram_tile_window_step);
|
||||
Base::GlobalPrefetchAsync(
|
||||
b_copy_lds_window1, b_tile_windows[number<0>{}], b_dram_tile_window_step);
|
||||
|
||||
// tile distribution for the register tiles
|
||||
constexpr auto ALdsTileDistr =
|
||||
make_static_tile_distribution(BlockGemm::MakeABlockDistributionEncode());
|
||||
constexpr auto BLdsTileDistr =
|
||||
make_static_tile_distribution(BlockGemm::MakeBBlockDistributionEncode());
|
||||
|
||||
using ALdsTile = decltype(make_static_distributed_tensor<ADataType>(ALdsTileDistr));
|
||||
using BLdsTile = decltype(make_static_distributed_tensor<BDataType>(BLdsTileDistr));
|
||||
|
||||
// register tiles; double buffering -> a register tile corresponds to a LDS tile window
|
||||
ALdsTile a_block_tile0, a_block_tile1;
|
||||
BLdsTile b_block_tile0, b_block_tile1;
|
||||
|
||||
// Some sanity checks on the LDS tile sizes
|
||||
static_assert(sizeof(ALdsTile) == MPerBlock *
|
||||
(KPerBlock * sizeof(ADataType) / APackedSize) *
|
||||
NWarp / BlockSize,
|
||||
"ALdsTile size is wrong!");
|
||||
static_assert(sizeof(BLdsTile) == NPerBlock *
|
||||
(KPerBlock * sizeof(BDataType) / BPackedSize) *
|
||||
MWarp / BlockSize,
|
||||
"BLdsTile size is wrong!");
|
||||
static_assert(Policy::template GetSmemSizeA<Problem>() ==
|
||||
MPerBlock * (KPerBlock * sizeof(ADataType) / APackedSize),
|
||||
"SmemSizeA size is wrong!");
|
||||
static_assert(Policy::template GetSmemSizeB<Problem>() ==
|
||||
(KPerBlock * sizeof(BDataType) / BPackedSize) * NPerBlock,
|
||||
"SmemSizeB size is wrong!");
|
||||
|
||||
////////////// MX Scale register tiles (ping-pong buffers) /////////////////
|
||||
// No packing needed - each thread gets e8m0_t elements directly
|
||||
// Each thread will cast e8m0_t to int32_t for WarpGemm with OpSel=0
|
||||
|
||||
using ScaleATileType = decltype(load_tile(scale_a_dram_window));
|
||||
using ScaleBTileType = decltype(load_tile(scale_b_dram_window));
|
||||
ScaleATileType scale_a_tile_ping, scale_a_tile_pong;
|
||||
ScaleBTileType scale_b_tile_ping, scale_b_tile_pong;
|
||||
|
||||
// initialize Scale DRAM window steps, used to advance the Scale DRAM windows
|
||||
using ScaleADramTileWindowStep = typename ScaleADramBlockWindowTmp::BottomTensorIndex;
|
||||
using ScaleBDramTileWindowStep = typename ScaleBDramBlockWindowTmp::BottomTensorIndex;
|
||||
constexpr ScaleADramTileWindowStep scale_a_dram_tile_window_step =
|
||||
make_array(0, ScaleKDimPerBlock);
|
||||
constexpr ScaleBDramTileWindowStep scale_b_dram_tile_window_step =
|
||||
make_array(0, ScaleKDimPerBlock);
|
||||
|
||||
// Helper function to load scales
|
||||
auto load_scales_from_dram = [&](auto& scale_a, auto& scale_b) {
|
||||
scale_a = load_tile(scale_a_dram_window);
|
||||
scale_b = load_tile(scale_b_dram_window);
|
||||
move_tile_window(scale_a_dram_window, scale_a_dram_tile_window_step);
|
||||
move_tile_window(scale_b_dram_window, scale_b_dram_tile_window_step);
|
||||
};
|
||||
|
||||
/// TODO: enable transpose
|
||||
// constexpr auto a_lds_input_tile_distr = [ALdsTileDistr]() {
|
||||
// if constexpr(is_a_load_tr_v)
|
||||
// return make_static_tile_distribution(
|
||||
// typename InputTileDistributionTraits<
|
||||
// typename decltype(ALdsTileDistr)::DstrEncode,
|
||||
// typename Problem::ADataType>::TransposedDstrEncode{});
|
||||
// else
|
||||
// return ALdsTileDistr;
|
||||
// }();
|
||||
// constexpr auto b_lds_input_tile_distr = [BLdsTileDistr]() {
|
||||
// if constexpr(is_b_load_tr_v)
|
||||
// return make_static_tile_distribution(
|
||||
// typename InputTileDistributionTraits<
|
||||
// typename decltype(BLdsTileDistr)::DstrEncode,
|
||||
// typename Problem::BDataType>::TransposedDstrEncode{});
|
||||
// else
|
||||
// return BLdsTileDistr;
|
||||
// }();
|
||||
|
||||
// LDS tile windows for reading;
|
||||
// they share the data pointer with the LDS windows for storing
|
||||
// but also associate with a distribution to produce a register tile when reading
|
||||
auto a_lds_ld_window0 =
|
||||
make_tile_window(a_lds_block0, a_lds_shape, {0, 0}, ALdsTileDistr);
|
||||
auto a_lds_ld_window1 =
|
||||
make_tile_window(a_lds_block1, a_lds_shape, {0, 0}, ALdsTileDistr);
|
||||
auto b_lds_ld_window0 =
|
||||
make_tile_window(b_lds_block0, b_lds_shape, {0, 0}, BLdsTileDistr);
|
||||
auto b_lds_ld_window1 =
|
||||
make_tile_window(b_lds_block1, b_lds_shape, {0, 0}, BLdsTileDistr);
|
||||
|
||||
static_assert(!(is_tile_window_linear_v<decltype(a_lds_ld_window0)>) &&
|
||||
!(is_tile_window_linear_v<decltype(a_lds_ld_window1)>) &&
|
||||
!(is_tile_window_linear_v<decltype(b_lds_ld_window0)>) &&
|
||||
!(is_tile_window_linear_v<decltype(b_lds_ld_window1)>),
|
||||
"LDS windows must not be linear");
|
||||
|
||||
// write to LDS window(0) must complete before the local prefetch
|
||||
block_sync_lds_direct_load();
|
||||
// read A(0), B(0) from LDS window(0) to pipeline registers(0)
|
||||
Base::LocalPrefetch(a_block_tile0, a_lds_ld_window0, is_a_load_tr_v);
|
||||
Base::LocalPrefetch(b_block_tile0, b_lds_ld_window0, is_b_load_tr_v);
|
||||
// LDS window(0) contents are overwritten below by global prefetch, need to sync
|
||||
block_sync_lds();
|
||||
// read A(2), B(2) from DRAM to LDS window(0)
|
||||
// and advance the DRAM windows
|
||||
Base::GlobalPrefetchAsync(
|
||||
a_copy_lds_window0, a_tile_windows[number<0>{}], a_dram_tile_window_step);
|
||||
Base::GlobalPrefetchAsync(
|
||||
b_copy_lds_window0, b_tile_windows[number<0>{}], b_dram_tile_window_step);
|
||||
|
||||
// Load scales for iteration 0 (ping)
|
||||
load_scales_from_dram(scale_a_tile_ping, scale_b_tile_ping);
|
||||
|
||||
// Load scales for iteration 1 (pong) if needed
|
||||
if(num_loop > 1)
|
||||
{
|
||||
load_scales_from_dram(scale_a_tile_pong, scale_b_tile_pong);
|
||||
}
|
||||
|
||||
if(HasHotLoop)
|
||||
{
|
||||
// we have had 3 global prefetches so far, indexed (0, 1, 2).
|
||||
index_t i_global_read = amd_wave_read_first_lane(3);
|
||||
// alternate ping: (read to register tile(1), use register tile(0) as gemm input)
|
||||
// pong: (read to register tile(0), use register tile(1) as gemm input)
|
||||
do
|
||||
{
|
||||
// ping
|
||||
{
|
||||
// read A(i-1), B(i-1) from LDS window(1) to pipeline registers(1)
|
||||
Base::LocalPrefetch(a_block_tile1, a_lds_ld_window1, is_a_load_tr_v);
|
||||
Base::LocalPrefetch(b_block_tile1, b_lds_ld_window1, is_b_load_tr_v);
|
||||
// LDS window(1) contents are overwritten by global prefetch, need to sync
|
||||
block_sync_lds();
|
||||
// read A(i), B(i) from DRAM to LDS window(1)
|
||||
// and advance the DRAM windows
|
||||
Base::GlobalPrefetchAsync(a_copy_lds_window1,
|
||||
a_tile_windows[number<0>{}],
|
||||
a_dram_tile_window_step);
|
||||
Base::GlobalPrefetchAsync(b_copy_lds_window1,
|
||||
b_tile_windows[number<0>{}],
|
||||
b_dram_tile_window_step);
|
||||
// C(i-3) = A(i-3) @ B(i-3) with MX scaling
|
||||
block_gemm(c_block_tile,
|
||||
a_block_tile0,
|
||||
b_block_tile0,
|
||||
scale_a_tile_ping,
|
||||
scale_b_tile_ping);
|
||||
HotLoopScheduler();
|
||||
// Load next scales after using current scales above
|
||||
load_scales_from_dram(scale_a_tile_ping, scale_b_tile_ping);
|
||||
}
|
||||
// pong
|
||||
{
|
||||
// write to LDS window(0) must complete before the local prefetch
|
||||
block_sync_lds_direct_load();
|
||||
// read A(i), B(i) from LDS window(0) to pipeline registers(0)
|
||||
Base::LocalPrefetch(a_block_tile0, a_lds_ld_window0, is_a_load_tr_v);
|
||||
Base::LocalPrefetch(b_block_tile0, b_lds_ld_window0, is_b_load_tr_v);
|
||||
// LDS window(0) contents are overwritten by global prefetch, need to sync
|
||||
block_sync_lds();
|
||||
// read A(i+1), B(i+1) from DRAM to LDS window(0)
|
||||
// and advance the DRAM windows
|
||||
Base::GlobalPrefetchAsync(a_copy_lds_window0,
|
||||
a_tile_windows[number<0>{}],
|
||||
a_dram_tile_window_step);
|
||||
Base::GlobalPrefetchAsync(b_copy_lds_window0,
|
||||
b_tile_windows[number<0>{}],
|
||||
b_dram_tile_window_step);
|
||||
// C(i-2) = A(i-2) @ B(i-2) with MX scaling
|
||||
block_gemm(c_block_tile,
|
||||
a_block_tile1,
|
||||
b_block_tile1,
|
||||
scale_a_tile_pong,
|
||||
scale_b_tile_pong);
|
||||
HotLoopScheduler();
|
||||
// Load next scales after using current scales above
|
||||
load_scales_from_dram(scale_a_tile_pong, scale_b_tile_pong);
|
||||
}
|
||||
i_global_read += 2;
|
||||
} while(i_global_read < num_loop);
|
||||
}
|
||||
|
||||
// 3 block gemms remaining
|
||||
if constexpr(TailNum == TailNumber::Three)
|
||||
{
|
||||
{
|
||||
// read A(num_loop-1), B(num_loop-1) from LDS window(1) to pipeline registers(1)
|
||||
Base::LocalPrefetch(a_block_tile1, a_lds_ld_window1, is_a_load_tr_v);
|
||||
Base::LocalPrefetch(b_block_tile1, b_lds_ld_window1, is_b_load_tr_v);
|
||||
// C(num_loop-2) = A(num_loop-2) @ B(num_loop-2) with MX scaling
|
||||
block_gemm(c_block_tile,
|
||||
a_block_tile0,
|
||||
b_block_tile0,
|
||||
scale_a_tile_ping,
|
||||
scale_b_tile_ping);
|
||||
|
||||
// load last scales to ping for the last iteration to ping buffers
|
||||
load_scales_from_dram(scale_a_tile_ping, scale_b_tile_ping);
|
||||
}
|
||||
{
|
||||
// write to LDS window(0) must complete before the local prefetch
|
||||
block_sync_lds_direct_load();
|
||||
// read A(num_loop), B(num_loop) from LDS window(0) to pipeline registers(0)
|
||||
Base::LocalPrefetch(a_block_tile0, a_lds_ld_window0, is_a_load_tr_v);
|
||||
Base::LocalPrefetch(b_block_tile0, b_lds_ld_window0, is_b_load_tr_v);
|
||||
// C(num_loop-1) = A(num_loop-1) @ B(num_loop-1) with MX scaling
|
||||
block_gemm(c_block_tile,
|
||||
a_block_tile1,
|
||||
b_block_tile1,
|
||||
scale_a_tile_pong,
|
||||
scale_b_tile_pong);
|
||||
}
|
||||
{
|
||||
// C(num_loop) = A(num_loop) @ B(num_loop) with MX scaling
|
||||
block_gemm(c_block_tile,
|
||||
a_block_tile0,
|
||||
b_block_tile0,
|
||||
scale_a_tile_ping,
|
||||
scale_b_tile_ping);
|
||||
}
|
||||
}
|
||||
else if(TailNum == TailNumber::Two)
|
||||
// 2 block gemms remaining
|
||||
{
|
||||
{
|
||||
// read A(num_loop), B(num_loop) from LDS window(1) to pipeline registers(1)
|
||||
Base::LocalPrefetch(a_block_tile1, a_lds_ld_window1, is_a_load_tr_v);
|
||||
Base::LocalPrefetch(b_block_tile1, b_lds_ld_window1, is_b_load_tr_v);
|
||||
block_gemm(c_block_tile,
|
||||
a_block_tile0,
|
||||
b_block_tile0,
|
||||
scale_a_tile_ping,
|
||||
scale_b_tile_ping);
|
||||
}
|
||||
{
|
||||
// C(num_loop) = A(num_loop) @ B(num_loop) with MX scaling
|
||||
block_gemm(c_block_tile,
|
||||
a_block_tile1,
|
||||
b_block_tile1,
|
||||
scale_a_tile_pong,
|
||||
scale_b_tile_pong);
|
||||
}
|
||||
}
|
||||
else if(TailNum == TailNumber::One)
|
||||
{
|
||||
block_sync_lds();
|
||||
// C(num_loop) = A(num_loop) @ B(num_loop) with MX scaling
|
||||
block_gemm(c_block_tile,
|
||||
a_block_tile0,
|
||||
b_block_tile0,
|
||||
scale_a_tile_ping,
|
||||
scale_b_tile_ping);
|
||||
__builtin_amdgcn_sched_barrier(0);
|
||||
}
|
||||
|
||||
return c_block_tile;
|
||||
}
|
||||
};
|
||||
|
||||
template <typename ADramBlockWindowTmp,
|
||||
typename BDramBlockWindowTmp,
|
||||
typename ScaleADramBlockWindowTmp,
|
||||
typename ScaleBDramBlockWindowTmp,
|
||||
typename AElementFunction,
|
||||
typename BElementFunction>
|
||||
CK_TILE_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
|
||||
const AElementFunction& a_element_func,
|
||||
const BDramBlockWindowTmp& b_dram_block_window_tmp,
|
||||
const BElementFunction& b_element_func,
|
||||
const ScaleADramBlockWindowTmp& scale_a_window,
|
||||
const ScaleBDramBlockWindowTmp& scale_b_window,
|
||||
index_t num_loop,
|
||||
void* __restrict__ p_smem_0,
|
||||
void* __restrict__ p_smem_1) const
|
||||
{
|
||||
const bool has_hot_loop = Base::BlockHasHotloop(num_loop);
|
||||
const auto tail_number = Base::GetBlockLoopTailNum(num_loop);
|
||||
|
||||
const auto RunPipeline = [&](auto hot_loop_, auto tail_num_) {
|
||||
return PipelineImpl<Scheduler>{}.template operator()<hot_loop_.value, tail_num_.value>(
|
||||
a_dram_block_window_tmp,
|
||||
a_element_func,
|
||||
b_dram_block_window_tmp,
|
||||
b_element_func,
|
||||
scale_a_window,
|
||||
scale_b_window,
|
||||
num_loop,
|
||||
p_smem_0,
|
||||
p_smem_1);
|
||||
};
|
||||
|
||||
return Base::TailHandler(RunPipeline, has_hot_loop, tail_number);
|
||||
}
|
||||
|
||||
public:
|
||||
template <typename ADramBlockWindowTmp,
|
||||
typename BDramBlockWindowTmp,
|
||||
typename ScaleADramBlockWindowTmp,
|
||||
typename ScaleBDramBlockWindowTmp>
|
||||
CK_TILE_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
|
||||
const BDramBlockWindowTmp& b_dram_block_window_tmp,
|
||||
const ScaleADramBlockWindowTmp& scale_a_window,
|
||||
const ScaleBDramBlockWindowTmp& scale_b_window,
|
||||
const index_t num_loop,
|
||||
void* __restrict__ p_smem_0,
|
||||
void* __restrict__ p_smem_1) const
|
||||
{
|
||||
const bool has_hot_loop = Base::BlockHasHotloop(num_loop);
|
||||
const auto tail_number = Base::GetBlockLoopTailNum(num_loop);
|
||||
|
||||
const auto RunPipeline = [&](auto hot_loop_, auto tail_num_) {
|
||||
return PipelineImpl<Scheduler>{}.template operator()<hot_loop_.value, tail_num_.value>(
|
||||
make_tuple(a_dram_block_window_tmp),
|
||||
element_wise::PassThrough{},
|
||||
make_tuple(b_dram_block_window_tmp),
|
||||
element_wise::PassThrough{},
|
||||
scale_a_window,
|
||||
scale_b_window,
|
||||
num_loop,
|
||||
p_smem_0,
|
||||
p_smem_1);
|
||||
};
|
||||
|
||||
return Base::TailHandler(RunPipeline, has_hot_loop, tail_number);
|
||||
}
|
||||
};
|
||||
} // namespace ck_tile
|
||||
@@ -0,0 +1,195 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/core/arch/arch.hpp"
|
||||
#include "ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp"
|
||||
#include "ck_tile/ops/common/tensor_layout.hpp"
|
||||
#include "ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp"
|
||||
#include <type_traits>
|
||||
|
||||
namespace ck_tile {
|
||||
// Default policy for MXGemmPipelineAgBgCrCompAsync
|
||||
// Customized methods: MakeALdsBlockDescriptor, MakeBLdsBlockDescriptor
|
||||
// GetBlockGemm implementation is copied from GemmPipelineAgBgCrCompV4DefaultPolicy
|
||||
// Adds MX scale tile distributions
|
||||
struct MXGemmPipelineAgBgCrCompAsyncDefaultPolicy
|
||||
: public UniversalGemmBasePolicy<MXGemmPipelineAgBgCrCompAsyncDefaultPolicy>
|
||||
{
|
||||
static constexpr auto ATileAccessPattern = tile_distribution_pattern::warp_raked;
|
||||
static constexpr auto BTileAccessPattern = tile_distribution_pattern::warp_raked;
|
||||
|
||||
// MX scaling configuration: each e8m0 scale covers 32 elements in K
|
||||
static constexpr int BlockScaleSize = 32;
|
||||
|
||||
template <typename Problem,
|
||||
typename OverrideADataType = remove_cvref_t<typename Problem::ADataType>>
|
||||
CK_TILE_HOST_DEVICE static constexpr auto MakeALdsBlockDescriptor()
|
||||
{
|
||||
constexpr index_t MPerBlock = Problem::BlockGemmShape::kM;
|
||||
constexpr index_t KPerBlock = Problem::BlockGemmShape::kK;
|
||||
if constexpr(is_a_load_tr<Problem>)
|
||||
{
|
||||
// TODO: better LDS descriptor for performance
|
||||
constexpr auto a_lds_block_desc_0 = make_naive_tensor_descriptor( //
|
||||
make_tuple(number<KPerBlock>{}, number<MPerBlock>{}),
|
||||
make_tuple(number<MPerBlock>{}, number<1>{}),
|
||||
number<MPerBlock>{},
|
||||
number<1>{});
|
||||
return a_lds_block_desc_0;
|
||||
}
|
||||
else
|
||||
{
|
||||
constexpr index_t KPack = GetSmemPackA<Problem>();
|
||||
|
||||
constexpr auto a_lds_block_desc_0 = make_naive_tensor_descriptor(
|
||||
make_tuple(number<KPerBlock / KPack>{}, number<MPerBlock>{}, number<KPack>{}),
|
||||
make_tuple(number<KPack>{}, number<KPerBlock>{}, number<1>{}),
|
||||
number<KPack>{},
|
||||
number<1>{});
|
||||
|
||||
return transform_tensor_descriptor(
|
||||
a_lds_block_desc_0,
|
||||
make_tuple(
|
||||
make_pass_through_transform(number<MPerBlock>{}),
|
||||
make_merge_transform(make_tuple(number<KPerBlock / KPack>{}, number<KPack>{}))),
|
||||
make_tuple(sequence<1>{}, sequence<0, 2>{}),
|
||||
make_tuple(sequence<0>{}, sequence<1>{}));
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Problem>
|
||||
CK_TILE_HOST_DEVICE static constexpr auto MakeBLdsBlockDescriptor()
|
||||
{
|
||||
constexpr index_t NPerBlock = Problem::BlockGemmShape::kN;
|
||||
constexpr index_t KPerBlock = Problem::BlockGemmShape::kK;
|
||||
if constexpr(is_b_load_tr<Problem>)
|
||||
{
|
||||
// TODO: better LDS descriptor for performance
|
||||
constexpr auto b_lds_block_desc_0 =
|
||||
make_naive_tensor_descriptor(make_tuple(number<KPerBlock>{}, number<NPerBlock>{}),
|
||||
make_tuple(number<NPerBlock>{}, number<1>{}),
|
||||
number<NPerBlock>{},
|
||||
number<1>{});
|
||||
return b_lds_block_desc_0;
|
||||
}
|
||||
else
|
||||
{
|
||||
constexpr index_t KPack = GetSmemPackB<Problem>();
|
||||
|
||||
constexpr auto b_lds_block_desc_0 = make_naive_tensor_descriptor(
|
||||
make_tuple(number<KPerBlock / KPack>{}, number<NPerBlock>{}, number<KPack>{}),
|
||||
make_tuple(number<KPack>{}, number<KPerBlock>{}, number<1>{}),
|
||||
number<KPack>{},
|
||||
number<1>{});
|
||||
|
||||
return transform_tensor_descriptor(
|
||||
b_lds_block_desc_0,
|
||||
make_tuple(
|
||||
make_pass_through_transform(number<NPerBlock>{}),
|
||||
make_merge_transform(make_tuple(number<KPerBlock / KPack>{}, number<KPack>{}))),
|
||||
make_tuple(sequence<1>{}, sequence<0, 2>{}),
|
||||
make_tuple(sequence<0>{}, sequence<1>{}));
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Problem>
|
||||
CK_TILE_HOST_DEVICE static constexpr auto GetBlockGemm()
|
||||
{
|
||||
using BlockWarps = typename Problem::BlockGemmShape::BlockWarps;
|
||||
using WarpTile = typename Problem::BlockGemmShape::WarpTile;
|
||||
|
||||
using ADataType = typename Problem::ADataType;
|
||||
using BDataType = typename Problem::BDataType;
|
||||
using CDataType = typename Problem::CDataType;
|
||||
|
||||
// FP4 and FP8 require different layouts for the scaled mfma instructions
|
||||
constexpr auto wg_attr_num_access =
|
||||
(std::is_same_v<ADataType, fp8_t> || std::is_same_v<BDataType, fp8_t>)
|
||||
? WGAttrNumAccessEnum::Double
|
||||
: WGAttrNumAccessEnum::Single;
|
||||
|
||||
using WarpGemm = WarpGemmDispatcher<ADataType,
|
||||
BDataType,
|
||||
CDataType, // AccDataType
|
||||
WarpTile::at(I0),
|
||||
WarpTile::at(I1),
|
||||
WarpTile::at(I2),
|
||||
Problem::TransposeC,
|
||||
false,
|
||||
false,
|
||||
wg_attr_num_access>;
|
||||
|
||||
using BlockGemmPolicy = BlockGemmARegBRegCRegV1CustomPolicy<ADataType,
|
||||
BDataType,
|
||||
CDataType,
|
||||
BlockWarps,
|
||||
WarpGemm>;
|
||||
|
||||
return BlockGemmARegBRegCRegV1<Problem, BlockGemmPolicy>{};
|
||||
}
|
||||
|
||||
// MX Scale tile distributions for loading from global memory
|
||||
template <typename Problem>
|
||||
CK_TILE_HOST_DEVICE static constexpr auto MakeMX_ScaleA_DramTileDistribution()
|
||||
{
|
||||
using BlockGemmShape = typename Problem::BlockGemmShape;
|
||||
using BlockWarps = typename BlockGemmShape::BlockWarps;
|
||||
using WarpTile = typename BlockGemmShape::WarpTile;
|
||||
|
||||
constexpr index_t MPerBlock = Problem::BlockGemmShape::kM;
|
||||
constexpr index_t MWarp = BlockWarps::at(number<0>{});
|
||||
constexpr index_t NWarp = BlockWarps::at(number<1>{});
|
||||
constexpr index_t MPerXdl = WarpTile::at(number<0>{});
|
||||
constexpr index_t KPerBlock = Problem::BlockGemmShape::kK;
|
||||
|
||||
constexpr index_t K_Lane = get_warp_size() / MPerXdl; // 64/16 = 4 threads in K dimension
|
||||
constexpr index_t MIterPerWarp = MPerBlock / (MWarp * MPerXdl);
|
||||
constexpr index_t KPerXdl = WarpTile::at(number<2>{});
|
||||
constexpr index_t KIterPerWarp = KPerBlock / KPerXdl;
|
||||
constexpr index_t KPerLane = KPerXdl / BlockScaleSize / K_Lane;
|
||||
|
||||
return make_static_tile_distribution(
|
||||
tile_distribution_encoding<
|
||||
sequence<NWarp>, // repeat over MWarps
|
||||
tuple<sequence<MIterPerWarp, MWarp, MPerXdl>, // M dimension (first)
|
||||
sequence<KIterPerWarp, K_Lane, KPerLane>>, // K dimension (second)
|
||||
tuple<sequence<0, 1>, sequence<2, 1>>, // <MWarp, NWarp>, <K_Lane, MPerXdl>
|
||||
tuple<sequence<0, 1>, sequence<1, 2>>,
|
||||
sequence<2, 1, 2>, // <KIterPerWarp, MIterPerWarp, KPerLane>
|
||||
sequence<0, 0, 2>>{});
|
||||
}
|
||||
|
||||
template <typename Problem>
|
||||
CK_TILE_HOST_DEVICE static constexpr auto MakeMX_ScaleB_DramTileDistribution()
|
||||
{
|
||||
using BlockGemmShape = typename Problem::BlockGemmShape;
|
||||
using BlockWarps = typename BlockGemmShape::BlockWarps;
|
||||
using WarpTile = typename BlockGemmShape::WarpTile;
|
||||
|
||||
constexpr index_t NPerBlock = Problem::BlockGemmShape::kN;
|
||||
constexpr index_t MWarp = BlockWarps::at(number<0>{});
|
||||
constexpr index_t NWarp = BlockWarps::at(number<1>{});
|
||||
constexpr index_t NPerXdl = WarpTile::at(number<1>{});
|
||||
constexpr index_t KPerBlock = Problem::BlockGemmShape::kK;
|
||||
constexpr index_t K_Lane = get_warp_size() / NPerXdl; // 64/16 = 4 threads in K dimension
|
||||
constexpr index_t NIterPerWarp = NPerBlock / (NWarp * NPerXdl);
|
||||
|
||||
constexpr index_t KPerXdl = WarpTile::at(number<2>{});
|
||||
constexpr index_t KIterPerWarp = KPerBlock / KPerXdl;
|
||||
constexpr index_t KPerLane = KPerXdl / BlockScaleSize / K_Lane;
|
||||
|
||||
return make_static_tile_distribution(
|
||||
tile_distribution_encoding<
|
||||
sequence<MWarp>, // repeat over MWarps
|
||||
tuple<sequence<NIterPerWarp, NWarp, NPerXdl>, // N dimension (first)
|
||||
sequence<KIterPerWarp, K_Lane, KPerLane>>, // K dimension (second)
|
||||
tuple<sequence<0, 1>, sequence<2, 1>>, // <MWarp, NWarp>, <K_Lane, MPerXdl>
|
||||
tuple<sequence<0, 1>, sequence<1, 2>>,
|
||||
sequence<2, 1, 2>, // <KIterPerWarp, NIterPerWarp, KPerLane>
|
||||
sequence<0, 0, 2>>{});
|
||||
}
|
||||
};
|
||||
} // namespace ck_tile
|
||||
@@ -127,7 +127,12 @@ struct ABQuantGemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3<Pro
|
||||
|
||||
CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
|
||||
{
|
||||
return Policy::template GetSmemSize<Problem>();
|
||||
// We are not storing the original packed type in LDS, so we need to multiply the smem size
|
||||
// by the packed size.
|
||||
constexpr index_t smem_size_a = Policy::template GetSmemSizeA<Problem>() * APackedSize;
|
||||
constexpr index_t smem_size_b = Policy::template GetSmemSizeB<Problem>() * BPackedSize;
|
||||
|
||||
return smem_size_a + smem_size_b;
|
||||
}
|
||||
|
||||
CK_TILE_HOST static std::string Print()
|
||||
|
||||
@@ -106,7 +106,12 @@ struct AQuantGemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem<Problem>
|
||||
|
||||
CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
|
||||
{
|
||||
return Policy::template GetSmemSize<Problem>();
|
||||
// We are not storing the original packed type in LDS, so we need to multiply the smem size
|
||||
// by the packed size.
|
||||
constexpr index_t smem_size_a = Policy::template GetSmemSizeA<Problem>() * APackedSize;
|
||||
constexpr index_t smem_size_b = Policy::template GetSmemSizeB<Problem>() * BPackedSize;
|
||||
|
||||
return smem_size_a + smem_size_b;
|
||||
}
|
||||
|
||||
CK_TILE_HOST static std::string Print()
|
||||
|
||||
@@ -102,7 +102,12 @@ struct AQuantGemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3<Prob
|
||||
|
||||
CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
|
||||
{
|
||||
return Policy::template GetSmemSize<Problem>();
|
||||
// We are not storing the original packed type in LDS, so we need to multiply the smem size
|
||||
// by the packed size.
|
||||
constexpr index_t smem_size_a = Policy::template GetSmemSizeA<Problem>() * APackedSize;
|
||||
constexpr index_t smem_size_b = Policy::template GetSmemSizeB<Problem>() * BPackedSize;
|
||||
|
||||
return smem_size_a + smem_size_b;
|
||||
}
|
||||
|
||||
CK_TILE_HOST static std::string Print()
|
||||
|
||||
@@ -115,7 +115,12 @@ struct BQuantGemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3<Prob
|
||||
|
||||
CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
|
||||
{
|
||||
return Policy::template GetSmemSize<Problem>();
|
||||
// We are not storing the original packed type in LDS, so we need to multiply the smem size
|
||||
// by the packed size.
|
||||
constexpr index_t smem_size_a = Policy::template GetSmemSizeA<Problem>() * APackedSize;
|
||||
constexpr index_t smem_size_b = Policy::template GetSmemSizeB<Problem>() * BPackedSize;
|
||||
|
||||
return smem_size_a + smem_size_b;
|
||||
}
|
||||
|
||||
CK_TILE_HOST static std::string Print()
|
||||
|
||||
@@ -112,7 +112,10 @@ struct MicroscaleGemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3<
|
||||
|
||||
CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
|
||||
{
|
||||
return Policy::template GetSmemSize<Problem>();
|
||||
constexpr index_t smem_size_a = Policy::template GetSmemSizeA<Problem>();
|
||||
constexpr index_t smem_size_b = Policy::template GetSmemSizeB<Problem>();
|
||||
|
||||
return smem_size_a + smem_size_b;
|
||||
}
|
||||
|
||||
CK_TILE_HOST static std::string Print()
|
||||
|
||||
@@ -57,6 +57,7 @@ add_subdirectory(add_rmsnorm2d_rdquant)
|
||||
# add_subdirectory(layernorm2d)
|
||||
# add_subdirectory(rmsnorm2d)
|
||||
add_subdirectory(gemm_block_scale)
|
||||
add_subdirectory(gemm_mx)
|
||||
add_subdirectory(utility)
|
||||
add_subdirectory(warp_gemm)
|
||||
add_subdirectory(reduce)
|
||||
|
||||
@@ -20,3 +20,21 @@ TYPED_TEST_SUITE(TEST_SUITE_NAME, KernelTypesCompAsync);
|
||||
#include "test_gemm_pipeline_ut_cases.inc"
|
||||
|
||||
#undef TEST_SUITE_NAME
|
||||
|
||||
template <typename T>
|
||||
class TestCkTileGemmPipelineCompAsync16x16x128
|
||||
: public TestCkTileGemmPipeline<T, TestCkTileGemmPipelineCompAsync16x16x128<T>>
|
||||
{
|
||||
public:
|
||||
static constexpr bool check_data_type() { return true; }
|
||||
};
|
||||
|
||||
TYPED_TEST_SUITE(TestCkTileGemmPipelineCompAsync16x16x128, KernelTypesCompAsync16x16x128);

// Single square 1024 x 1024 x 1024 problem; the trailing arguments are passed
// to RunSingle exactly as listed (0, 0, 0, 1).
TYPED_TEST(TestCkTileGemmPipelineCompAsync16x16x128, QuickTest)
{
    constexpr int kProblemDim = 1024;

    this->template RunSingle<false, false, false, false>(
        kProblemDim, kProblemDim, kProblemDim, 0, 0, 0, 1);
}
|
||||
|
||||
@@ -29,6 +29,7 @@ using NonPersistent = std::false_type;
|
||||
using I16 = ck_tile::number<16>;
|
||||
using I32 = ck_tile::number<32>;
|
||||
using I64 = ck_tile::number<64>;
|
||||
using I128 = ck_tile::number<128>;
|
||||
using I256 = ck_tile::number<256>;
|
||||
|
||||
// clang-format off
|
||||
@@ -224,6 +225,23 @@ using CompAsyncConfig = std::tuple<ALayout,
|
||||
Intrawave,
|
||||
CompAsync>;
|
||||
|
||||
template <typename ALayout, typename BLayout, typename CLayout, typename InputType>
|
||||
using CompAsyncConfig16x16x128 = std::tuple<ALayout,
|
||||
BLayout,
|
||||
CLayout,
|
||||
InputType, // AType
|
||||
InputType, // BType
|
||||
F32, // AccType
|
||||
F16, // OutputType
|
||||
I64, // MBlockTileSize
|
||||
I64, // NBlockTileSize
|
||||
I128, // KBlockTileSize
|
||||
I16, // MWarpTileSize
|
||||
I16, // NWarpTileSize
|
||||
I128, // KWarpTileSize
|
||||
Intrawave,
|
||||
CompAsync>;
|
||||
|
||||
using KernelTypesCompAsync = ::testing::Types<CompAsyncConfig<Row, Row, Row, F16>,
|
||||
CompAsyncConfig<Row, Col, Row, F16>,
|
||||
CompAsyncConfig<Col, Row, Row, F16>,
|
||||
@@ -232,6 +250,10 @@ using KernelTypesCompAsync = ::testing::Types<CompAsyncConfig<Row, Row, Row, F16
|
||||
CompAsyncConfig<Row, Col, Row, F8>,
|
||||
CompAsyncConfig<Col, Row, Row, F8>,
|
||||
CompAsyncConfig<Col, Col, Row, F8>>;
|
||||
|
||||
// Type list for the 16x16x128 comp-async suite: Row/Col/Row layouts with F4 and F8 inputs.
using KernelTypesCompAsync16x16x128 =
    ::testing::Types<CompAsyncConfig16x16x128<Row, Col, Row, F4>,
                     CompAsyncConfig16x16x128<Row, Col, Row, F8>>;
|
||||
|
||||
// clang-format off
|
||||
|
||||
using KernelTypesCompV6 = ::testing::Types<
|
||||
|
||||
@@ -7,6 +7,7 @@ using INT32 = ck_tile::int32_t;
|
||||
using F16 = ck_tile::half_t;
|
||||
using F32 = float;
|
||||
using F8 = ck_tile::fp8_t;
|
||||
using F4 = ck_tile::pk_fp4_t;
|
||||
|
||||
using BF16 = ck_tile::bf16_t;
|
||||
using BF8 = ck_tile::bf8_t;
|
||||
|
||||
17
test/ck_tile/gemm_mx/CMakeLists.txt
Normal file
17
test/ck_tile/gemm_mx/CMakeLists.txt
Normal file
@@ -0,0 +1,17 @@
|
||||
# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
# SPDX-License-Identifier: MIT
|
||||
|
||||
# Compile flags shared by every MX GEMM test executable.
set(TEST_MX_GEMM_COMPILE_OPTIONS -Wno-undefined-func-template)
if(CK_USE_OCP_FP8)
    list(APPEND TEST_MX_GEMM_COMPILE_OPTIONS -DCK_TILE_USE_OCP_FP8)
endif()

# The tests are only registered when GPU_TARGETS matches "gfx95".
if(NOT GPU_TARGETS MATCHES "gfx95")
    message(DEBUG "Skipping ck_tile MX GEMM tests for current target")
else()
    foreach(mx_precision fp4 fp8)
        add_gtest_executable(test_ck_tile_mx_gemm_${mx_precision} test_mx_gemm_${mx_precision}.cpp)
        target_compile_options(test_ck_tile_mx_gemm_${mx_precision} PRIVATE ${TEST_MX_GEMM_COMPILE_OPTIONS})
    endforeach()
endif()
|
||||
95
test/ck_tile/gemm_mx/test_mx_gemm_config.hpp
Normal file
95
test/ck_tile/gemm_mx/test_mx_gemm_config.hpp
Normal file
@@ -0,0 +1,95 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host/kernel_launch.hpp"
|
||||
#include "ck_tile/ops/epilogue.hpp"
|
||||
#include "ck_tile/ops/gemm.hpp"
|
||||
#include "ck_tile/ops/gemm_mx/kernel/scale_pointer.hpp"
|
||||
|
||||
template <typename ScaleM, typename ScaleN>
|
||||
struct MXGemmHostArgs : ck_tile::UniversalGemmHostArgs<1, 1, 0>
|
||||
{
|
||||
using Base = ck_tile::UniversalGemmHostArgs<1, 1, 0>;
|
||||
|
||||
MXGemmHostArgs(const void* a_ptr,
|
||||
const void* b_ptr,
|
||||
void* c_ptr_,
|
||||
ck_tile::index_t k_batch_,
|
||||
ck_tile::index_t M_,
|
||||
ck_tile::index_t N_,
|
||||
ck_tile::index_t K_,
|
||||
ck_tile::index_t stride_A_,
|
||||
ck_tile::index_t stride_B_,
|
||||
ck_tile::index_t stride_C_,
|
||||
ScaleM scale_m_,
|
||||
ScaleN scale_n_)
|
||||
: Base({a_ptr},
|
||||
{b_ptr},
|
||||
{},
|
||||
c_ptr_,
|
||||
k_batch_,
|
||||
M_,
|
||||
N_,
|
||||
K_,
|
||||
{stride_A_},
|
||||
{stride_B_},
|
||||
{},
|
||||
stride_C_),
|
||||
scale_m(scale_m_),
|
||||
scale_n(scale_n_)
|
||||
{
|
||||
}
|
||||
|
||||
ScaleM scale_m;
|
||||
ScaleN scale_n;
|
||||
};
|
||||
|
||||
struct MxGemmConfig
|
||||
{
|
||||
static constexpr ck_tile::index_t M_Tile = 128;
|
||||
static constexpr ck_tile::index_t N_Tile = 128;
|
||||
static constexpr ck_tile::index_t K_Tile = 512;
|
||||
|
||||
static constexpr ck_tile::index_t M_Warp = 1;
|
||||
static constexpr ck_tile::index_t N_Warp = 4;
|
||||
static constexpr ck_tile::index_t K_Warp = 1;
|
||||
|
||||
static constexpr ck_tile::index_t M_Warp_Tile = 16;
|
||||
static constexpr ck_tile::index_t N_Warp_Tile = 16;
|
||||
static constexpr ck_tile::index_t K_Warp_Tile = 128;
|
||||
|
||||
static constexpr bool kPadM = false;
|
||||
static constexpr bool kPadN = false;
|
||||
static constexpr bool kPadK = false;
|
||||
|
||||
static constexpr bool TransposeC = false;
|
||||
static constexpr bool UseStructuredSparsity = false;
|
||||
|
||||
static constexpr int kBlockPerCu = 1;
|
||||
static constexpr int TileParitionerGroupNum = 8;
|
||||
static constexpr int TileParitionerM01 = 4;
|
||||
static constexpr auto Scheduler = ck_tile::GemmPipelineScheduler::Intrawave;
|
||||
static constexpr ck_tile::index_t NumWaveGroups = 1;
|
||||
static constexpr bool DoubleSmemBuffer = false;
|
||||
static constexpr bool Preshuffle = false;
|
||||
|
||||
static constexpr int N_Repeat = N_Tile / N_Warp_Tile / N_Warp;
|
||||
static constexpr bool TiledMMAPermuteN = false;
|
||||
};
|
||||
|
||||
struct MXfp4_GemmConfig16 : MxGemmConfig
|
||||
{
|
||||
static constexpr ck_tile::index_t M_Tile = 64;
|
||||
static constexpr ck_tile::index_t N_Tile = 64;
|
||||
static constexpr ck_tile::index_t K_Tile = 256;
|
||||
};
|
||||
|
||||
struct MXfp8_GemmConfig16 : MxGemmConfig
|
||||
{
|
||||
static constexpr ck_tile::index_t M_Tile = 64;
|
||||
static constexpr ck_tile::index_t N_Tile = 64;
|
||||
static constexpr ck_tile::index_t K_Tile = 256;
|
||||
};
|
||||
30
test/ck_tile/gemm_mx/test_mx_gemm_fp4.cpp
Normal file
30
test/ck_tile/gemm_mx/test_mx_gemm_fp4.cpp
Normal file
@@ -0,0 +1,30 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#include "test_mx_gemm_config.hpp"
|
||||
#include "test_mx_gemm_util.hpp"
|
||||
|
||||
using Row = ck_tile::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck_tile::tensor_layout::gemm::ColumnMajor;
|
||||
|
||||
using MxFp4Types = ::testing::Types<
|
||||
std::tuple<ck_tile::pk_fp4_t, ck_tile::pk_fp4_t, MXfp4_GemmConfig16, Row, Col, Row>>;
|
||||
|
||||
template <typename TypeParam>
|
||||
class TestMxGemmFp4 : public TestMxGemmUtil<std::tuple_element_t<0, TypeParam>,
|
||||
std::tuple_element_t<1, TypeParam>,
|
||||
std::tuple_element_t<2, TypeParam>,
|
||||
std::tuple_element_t<3, TypeParam>,
|
||||
std::tuple_element_t<4, TypeParam>,
|
||||
std::tuple_element_t<5, TypeParam>>
|
||||
{
|
||||
};
|
||||
|
||||
TYPED_TEST_SUITE(TestMxGemmFp4, MxFp4Types);
|
||||
|
||||
TYPED_TEST(TestMxGemmFp4, BasicSizes)
|
||||
{
|
||||
this->Run(64, 64, 256);
|
||||
this->Run(128, 128, 256);
|
||||
this->Run(64, 128, 512);
|
||||
}
|
||||
30
test/ck_tile/gemm_mx/test_mx_gemm_fp8.cpp
Normal file
30
test/ck_tile/gemm_mx/test_mx_gemm_fp8.cpp
Normal file
@@ -0,0 +1,30 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#include "test_mx_gemm_config.hpp"
|
||||
#include "test_mx_gemm_util.hpp"
|
||||
|
||||
using Row = ck_tile::tensor_layout::gemm::RowMajor;
|
||||
using Col = ck_tile::tensor_layout::gemm::ColumnMajor;
|
||||
|
||||
using MxFp8Types =
|
||||
::testing::Types<std::tuple<ck_tile::fp8_t, ck_tile::fp8_t, MXfp8_GemmConfig16, Row, Col, Row>>;
|
||||
|
||||
template <typename TypeParam>
|
||||
class TestMxGemmFp8 : public TestMxGemmUtil<std::tuple_element_t<0, TypeParam>,
|
||||
std::tuple_element_t<1, TypeParam>,
|
||||
std::tuple_element_t<2, TypeParam>,
|
||||
std::tuple_element_t<3, TypeParam>,
|
||||
std::tuple_element_t<4, TypeParam>,
|
||||
std::tuple_element_t<5, TypeParam>>
|
||||
{
|
||||
};
|
||||
|
||||
TYPED_TEST_SUITE(TestMxGemmFp8, MxFp8Types);
|
||||
|
||||
TYPED_TEST(TestMxGemmFp8, BasicSizes)
|
||||
{
|
||||
this->Run(64, 64, 256);
|
||||
this->Run(128, 128, 256);
|
||||
this->Run(64, 128, 512);
|
||||
}
|
||||
97
test/ck_tile/gemm_mx/test_mx_gemm_instance.hpp
Normal file
97
test/ck_tile/gemm_mx/test_mx_gemm_instance.hpp
Normal file
@@ -0,0 +1,97 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/host.hpp"
|
||||
#include "ck_tile/ops/gemm_mx/pipeline/gemm_pipeline_ag_bg_cr_comp_async.hpp"
|
||||
#include "ck_tile/ops/gemm_mx/kernel/gemm_mx_kernel.hpp"
|
||||
#include "test_mx_gemm_config.hpp"
|
||||
|
||||
template <typename GemmConfig,
|
||||
typename ADataType,
|
||||
typename BDataType,
|
||||
typename AccDataType,
|
||||
typename CDataType,
|
||||
typename ALayout,
|
||||
typename BLayout,
|
||||
typename CLayout,
|
||||
typename ScaleM,
|
||||
typename ScaleN,
|
||||
bool persistent,
|
||||
bool Splitk>
|
||||
float mx_gemm_calc(const MXGemmHostArgs<ScaleM, ScaleN>& args, const ck_tile::stream_config& s)
|
||||
{
|
||||
using GemmShape = ck_tile::TileGemmShape<
|
||||
ck_tile::sequence<GemmConfig::M_Tile, GemmConfig::N_Tile, GemmConfig::K_Tile>,
|
||||
ck_tile::sequence<GemmConfig::M_Warp, GemmConfig::N_Warp, GemmConfig::K_Warp>,
|
||||
ck_tile::
|
||||
sequence<GemmConfig::M_Warp_Tile, GemmConfig::N_Warp_Tile, GemmConfig::K_Warp_Tile>>;
|
||||
|
||||
using MXGemmTraits = ck_tile::TileGemmUniversalTraits<GemmConfig::kPadM,
|
||||
GemmConfig::kPadN,
|
||||
GemmConfig::kPadK,
|
||||
GemmConfig::DoubleSmemBuffer,
|
||||
ALayout,
|
||||
BLayout,
|
||||
CLayout,
|
||||
GemmConfig::TransposeC,
|
||||
GemmConfig::UseStructuredSparsity,
|
||||
persistent,
|
||||
GemmConfig::NumWaveGroups,
|
||||
GemmConfig::Preshuffle>;
|
||||
|
||||
using MXPipelineProblem = ck_tile::UniversalGemmPipelineProblem<ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
GemmShape,
|
||||
MXGemmTraits,
|
||||
GemmConfig::Scheduler>;
|
||||
|
||||
using MXGemmPipeline = ck_tile::MXGemmPipelineAgBgCrCompAsync<MXPipelineProblem>;
|
||||
|
||||
using TilePartitioner =
|
||||
ck_tile::GemmSpatiallyLocalTilePartitioner<GemmShape,
|
||||
GemmConfig::TileParitionerGroupNum,
|
||||
GemmConfig::TileParitionerM01>;
|
||||
|
||||
using GemmEpilogue = ck_tile::CShuffleEpilogue<
|
||||
ck_tile::CShuffleEpilogueProblem<ADataType,
|
||||
BDataType,
|
||||
ck_tile::tuple<>,
|
||||
AccDataType,
|
||||
CDataType,
|
||||
ck_tile::tuple<>,
|
||||
CLayout,
|
||||
ck_tile::element_wise::PassThrough,
|
||||
TilePartitioner::MPerBlock,
|
||||
TilePartitioner::NPerBlock,
|
||||
GemmConfig::M_Warp,
|
||||
GemmConfig::N_Warp,
|
||||
GemmConfig::M_Warp_Tile,
|
||||
GemmConfig::N_Warp_Tile,
|
||||
GemmConfig::K_Warp_Tile,
|
||||
MXPipelineProblem::TransposeC>>;
|
||||
|
||||
using Kernel = ck_tile::MXGemmKernel<TilePartitioner, MXGemmPipeline, GemmEpilogue>;
|
||||
|
||||
auto kargs = Kernel::MakeKernelArgs(std::array<const void*, 1>{args.as_ptr},
|
||||
std::array<const void*, 1>{args.bs_ptr},
|
||||
std::array<const void*, 0>{},
|
||||
args.e_ptr,
|
||||
args.k_batch,
|
||||
args.M,
|
||||
args.N,
|
||||
args.K,
|
||||
std::array<ck_tile::index_t, 1>{args.stride_As},
|
||||
std::array<ck_tile::index_t, 1>{args.stride_Bs},
|
||||
std::array<ck_tile::index_t, 0>{},
|
||||
args.stride_E,
|
||||
args.scale_m,
|
||||
args.scale_n);
|
||||
|
||||
const auto kernel = ck_tile::make_kernel<Kernel::kBlockPerCu>(
|
||||
Kernel{}, Kernel::GridSize(kargs), Kernel::BlockSize(), 0, kargs);
|
||||
|
||||
return ck_tile::launch_kernel(s, kernel);
|
||||
}
|
||||
137
test/ck_tile/gemm_mx/test_mx_gemm_util.hpp
Normal file
137
test/ck_tile/gemm_mx/test_mx_gemm_util.hpp
Normal file
@@ -0,0 +1,137 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host.hpp"
|
||||
#include "ck_tile/host/check_err.hpp"
|
||||
#include "ck_tile/host/reference/reference_gemm.hpp"
|
||||
#include "test_mx_gemm_config.hpp"
|
||||
#include "test_mx_gemm_instance.hpp"
|
||||
|
||||
template <typename Layout>
|
||||
static constexpr auto is_row_major(Layout)
|
||||
{
|
||||
return ck_tile::bool_constant<
|
||||
std::is_same_v<ck_tile::remove_cvref_t<Layout>, ck_tile::tensor_layout::gemm::RowMajor>>{};
|
||||
}
|
||||
|
||||
template <typename ADataType, typename BDataType, typename AccDataType, typename CDataType>
|
||||
auto calculate_rtol_atol_mx(ck_tile::index_t K, float max_accumulated_value)
|
||||
{
|
||||
using ComputeType =
|
||||
std::conditional_t<sizeof(ADataType) < sizeof(BDataType), ADataType, BDataType>;
|
||||
const auto rtol = ck_tile::get_relative_threshold<ComputeType, CDataType, AccDataType>(K);
|
||||
const auto atol = ck_tile::get_absolute_threshold<ComputeType, CDataType, AccDataType>(
|
||||
max_accumulated_value, K);
|
||||
return ck_tile::make_tuple(rtol, atol);
|
||||
}
|
||||
|
||||
template <typename ADataType,
|
||||
typename BDataType,
|
||||
typename GemmConfig,
|
||||
typename ALayout,
|
||||
typename BLayout,
|
||||
typename CLayout>
|
||||
class TestMxGemmUtil : public ::testing::Test
|
||||
{
|
||||
protected:
|
||||
using AccDataType = float;
|
||||
using CDataType = ck_tile::fp16_t;
|
||||
using ScaleType = ck_tile::e8m0_t;
|
||||
using ScaleM = ck_tile::MXScalePointer<ScaleType, 1, 32>;
|
||||
using ScaleN = ck_tile::MXScalePointer<ScaleType, 1, 32>;
|
||||
|
||||
void Run(ck_tile::index_t M, ck_tile::index_t N, ck_tile::index_t K, int seed = 1234)
|
||||
{
|
||||
const ck_tile::index_t scale_k_size = K / 32;
|
||||
const ck_tile::index_t stride_A =
|
||||
ck_tile::get_default_stride(M, K, 0, is_row_major(ALayout{}));
|
||||
const ck_tile::index_t stride_B =
|
||||
ck_tile::get_default_stride(K, N, 0, is_row_major(BLayout{}));
|
||||
const ck_tile::index_t stride_C =
|
||||
ck_tile::get_default_stride(M, N, 0, is_row_major(CLayout{}));
|
||||
const ck_tile::index_t stride_scale_a =
|
||||
ck_tile::get_default_stride(M, scale_k_size, 0, is_row_major(ALayout{}));
|
||||
const ck_tile::index_t stride_scale_b =
|
||||
ck_tile::get_default_stride(scale_k_size, N, 0, is_row_major(BLayout{}));
|
||||
|
||||
ck_tile::HostTensor<ADataType> a_host(
|
||||
ck_tile::host_tensor_descriptor(M, K, stride_A, is_row_major(ALayout{})));
|
||||
ck_tile::HostTensor<BDataType> b_host(
|
||||
ck_tile::host_tensor_descriptor(K, N, stride_B, is_row_major(BLayout{})));
|
||||
ck_tile::HostTensor<CDataType> c_host(
|
||||
ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{})));
|
||||
ck_tile::HostTensor<ScaleType> scale_a_host(ck_tile::host_tensor_descriptor(
|
||||
M, scale_k_size, stride_scale_a, is_row_major(ALayout{})));
|
||||
ck_tile::HostTensor<ScaleType> scale_b_host(ck_tile::host_tensor_descriptor(
|
||||
scale_k_size, N, stride_scale_b, is_row_major(BLayout{})));
|
||||
|
||||
ck_tile::FillUniformDistribution<ADataType>{-2.f, 2.f, seed++}(a_host);
|
||||
ck_tile::FillUniformDistribution<BDataType>{-2.f, 2.f, seed++}(b_host);
|
||||
ck_tile::FillUniformDistribution<ScaleType>{0.001f, 10.f, seed++}(scale_a_host);
|
||||
ck_tile::FillUniformDistribution<ScaleType>{0.001f, 10.f, seed++}(scale_b_host);
|
||||
|
||||
ck_tile::DeviceMem a_dev_buf(a_host.get_element_space_size_in_bytes());
|
||||
ck_tile::DeviceMem b_dev_buf(b_host.get_element_space_size_in_bytes());
|
||||
ck_tile::DeviceMem c_dev_buf(c_host.get_element_space_size_in_bytes());
|
||||
ck_tile::DeviceMem scale_a_dev_buf(scale_a_host.get_element_space_size_in_bytes());
|
||||
ck_tile::DeviceMem scale_b_dev_buf(scale_b_host.get_element_space_size_in_bytes());
|
||||
|
||||
a_dev_buf.ToDevice(a_host.data());
|
||||
b_dev_buf.ToDevice(b_host.data());
|
||||
c_dev_buf.SetZero();
|
||||
scale_a_dev_buf.ToDevice(scale_a_host.data());
|
||||
scale_b_dev_buf.ToDevice(scale_b_host.data());
|
||||
|
||||
ScaleM scale_m(reinterpret_cast<ScaleType*>(scale_a_dev_buf.GetDeviceBuffer()));
|
||||
ScaleN scale_n(reinterpret_cast<ScaleType*>(scale_b_dev_buf.GetDeviceBuffer()));
|
||||
|
||||
MXGemmHostArgs<ScaleM, ScaleN> args(a_dev_buf.GetDeviceBuffer(),
|
||||
b_dev_buf.GetDeviceBuffer(),
|
||||
c_dev_buf.GetDeviceBuffer(),
|
||||
1,
|
||||
M,
|
||||
N,
|
||||
K,
|
||||
stride_A,
|
||||
stride_B,
|
||||
stride_C,
|
||||
scale_m,
|
||||
scale_n);
|
||||
|
||||
mx_gemm_calc<GemmConfig,
|
||||
ADataType,
|
||||
BDataType,
|
||||
AccDataType,
|
||||
CDataType,
|
||||
ALayout,
|
||||
BLayout,
|
||||
CLayout,
|
||||
ScaleM,
|
||||
ScaleN,
|
||||
true,
|
||||
false>(args, ck_tile::stream_config{nullptr, true, 1, 0, 1, true, true, 50});
|
||||
|
||||
c_dev_buf.FromDevice(c_host.data());
|
||||
|
||||
ck_tile::HostTensor<CDataType> c_ref(
|
||||
ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{})));
|
||||
c_ref.SetZero();
|
||||
ck_tile::reference_mx_gemm<ADataType, BDataType, ScaleType, AccDataType, CDataType>(
|
||||
a_host, b_host, c_ref, scale_a_host, scale_b_host);
|
||||
|
||||
const float max_accumulated_value = ck_tile::type_convert<float>(c_ref.max());
|
||||
const auto rtol_atol = calculate_rtol_atol_mx<ADataType, BDataType, AccDataType, CDataType>(
|
||||
K, max_accumulated_value);
|
||||
const double rtol = rtol_atol.at(ck_tile::number<0>{});
|
||||
const double atol = rtol_atol.at(ck_tile::number<1>{});
|
||||
|
||||
bool pass = ck_tile::check_err(c_host, c_ref, "MX GEMM: Incorrect results!", rtol, atol);
|
||||
|
||||
EXPECT_TRUE(pass);
|
||||
}
|
||||
};
|
||||
Reference in New Issue
Block a user