Files
composable_kernel/test/ck_tile/flatmm/mx_flatmm_instance.hpp
Andriy Roshchenko d5c9215064 [rocm-libraries] ROCm/rocm-libraries#7359 (commit dd62f9f)
[CK_TILE][GFX1250] Enable MX GEMM FLATMM with ASYNC

## Motivation

Enables MX GEMM FLATMM pipeline on gfx1250. The pipeline uses an async
load instruction for tensor A, which complements the existing MX GEMM
FLATMM pipeline with TDM load. At this time, only FLATMM MX pipelines
are enabled on gfx1250.

## Technical Details

The existing gfx950 implementation was extended to support gfx1250
architecture. All three MX FP data types are supported across the two
ASICs.
It should be noted that while the TDM pipeline uses an emulated
32x32x128 warp-tile instruction, the present submission relies on the
built-in 16x16x128 instruction, called 4 times per warp.

## Test Plan

Existing `test/ck_tile/flatmm` tests were extended to cover new gfx1250
functionality.

To help facilitate the testing in development,
`example/ck_tile/18_flatmm/script/smoke_test_mx.sh` script was
introduced to verify various combinations of supported data types and
pipeline versions.

## Test Result

The present submission is expected to work on both gfx950 and gfx1250
hardware for all reasonable sizes and all MX FP8/FP6/FP4 data types.

## Submission Checklist

- [x] Look over the contributing guidelines at
https://github.com/ROCm/ROCm/blob/develop/CONTRIBUTING.md#pull-requests.
- [x] Relies on #6978 and should only be merged after the changes are
merged to the `develop`.
2026-05-29 17:02:45 +00:00

194 lines
9.8 KiB
C++

// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#include <hip/hip_runtime.h>
#include <cstring>
#include <iostream>
#include <ostream>
#include <string>
#include <type_traits>
#include "ck_tile/host.hpp"
#include "mx_flatmm.hpp"
template <typename Layout>
using is_row_major_t = ck_tile::bool_constant<
std::is_same_v<ck_tile::remove_cvref_t<Layout>, ck_tile::tensor_layout::gemm::RowMajor>>;
template <typename MXFlatmmArchTraits,
typename ADataType,
typename BDataType,
typename DsDatatype,
typename AccDataType,
typename CDataType,
typename ALayout,
typename BLayout,
typename DsLayout,
typename CLayout,
typename ScaleM,
typename ScaleN,
bool persistent,
typename CDEElementWise,
bool Splitk,
bool HasHotLoop,
ck_tile::TailNumber TailNum>
float mx_flatmm_calc(const ck_tile::ScaleFlatmmHostArgs<ScaleM, ScaleN>& args,
const ck_tile::stream_config& s)
{
using FlatmmConfig = typename MXFlatmmArchTraits::Config;
using FlatmmShape = ck_tile::TileGemmShape<
ck_tile::sequence<FlatmmConfig::M_Tile, FlatmmConfig::N_Tile, FlatmmConfig::K_Tile>,
ck_tile::sequence<FlatmmConfig::M_Warp, FlatmmConfig::N_Warp, FlatmmConfig::K_Warp>,
ck_tile::sequence<FlatmmConfig::M_Warp_Tile,
FlatmmConfig::N_Warp_Tile,
FlatmmConfig::K_Warp_Tile>>;
using MXGemmTraits = ck_tile::TileGemmUniversalTraits<FlatmmConfig::kPadM,
FlatmmConfig::kPadN,
FlatmmConfig::kPadK,
FlatmmConfig::DoubleSmemBuffer,
ALayout,
BLayout,
CLayout,
FlatmmConfig::TransposeC,
FlatmmConfig::UseStructuredSparsity,
persistent,
FlatmmConfig::NumWaveGroups,
true>;
using ComputeDataType = ADataType;
static_assert(sizeof(ComputeDataType) >= sizeof(BDataType),
"mixed_prec_flatmm requires ADataType is a wider type than BDataType");
constexpr auto scheduler = FlatmmConfig::Scheduler;
ck_tile::ignore = Splitk;
// determined by scale shuffle pattern
constexpr int BlockedXDLN_PerWarp = MXFlatmmArchTraits::BlockedXDLN_PerWarp;
using MXPipelineProblem = ck_tile::MXFlatmmPipelineProblem<ADataType,
BDataType,
AccDataType,
FlatmmShape,
MXGemmTraits,
scheduler,
HasHotLoop,
TailNum>;
using MXFlatmmPipeline =
typename MXFlatmmArchTraits::template MXFlatmmPipeline<MXPipelineProblem>;
using TilePartitioner =
ck_tile::GemmSpatiallyLocalTilePartitioner<FlatmmShape,
FlatmmConfig::TileParitionerGroupNum,
FlatmmConfig::TileParitionerM01>;
using GemmEpilogue = std::conditional_t<
FlatmmConfig::TiledMMAPermuteN,
ck_tile::PermuteNEpilogue<ck_tile::PermuteNEpilogueProblem<ComputeDataType,
ComputeDataType,
DsDatatype,
AccDataType,
CDataType,
DsLayout,
CLayout,
CDEElementWise,
TilePartitioner::MPerBlock,
TilePartitioner::NPerBlock,
FlatmmConfig::M_Warp,
FlatmmConfig::N_Warp,
FlatmmConfig::M_Warp_Tile,
FlatmmConfig::N_Warp_Tile,
FlatmmConfig::K_Warp_Tile,
MXPipelineProblem::TransposeC,
false, // FixedVectorSize
1>>, // VectorSizeC
ck_tile::CShuffleEpilogue<ck_tile::CShuffleEpilogueProblem<ComputeDataType,
ComputeDataType,
DsDatatype,
AccDataType,
CDataType,
DsLayout,
CLayout,
CDEElementWise,
TilePartitioner::MPerBlock,
TilePartitioner::NPerBlock,
FlatmmConfig::M_Warp,
FlatmmConfig::N_Warp,
FlatmmConfig::M_Warp_Tile,
FlatmmConfig::N_Warp_Tile,
FlatmmConfig::K_Warp_Tile,
MXPipelineProblem::TransposeC,
FlatmmConfig::NumWaveGroups,
false, // FixedVectorSize
1, // VectorSizeC
BlockedXDLN_PerWarp>>>;
using Kernel = ck_tile::MXFlatmmKernel<TilePartitioner, MXFlatmmPipeline, GemmEpilogue>;
auto kargs = Kernel::MakeKernelArgs(args);
const dim3 grids = Kernel::GridSize(kargs);
const dim3 blocks = Kernel::BlockSize();
if(!Kernel::IsSupportedArgument(kargs))
throw std::runtime_error("Wrong! Arguments not supported! Skipping gemm!\n");
if(s.log_level_ > 0)
{
std::cout << "Launching kernel with args:" << FlatmmShape::GetName() << "\n"
<< "Shape: " << FlatmmShape::GetName() << "\n"
<< "problem: " << MXPipelineProblem::GetName() << "\n"
<< "pipeline: " << MXFlatmmPipeline::GetName() << "\n"
<< "epilogue: " << GemmEpilogue::GetName() << "\n"
<< "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
<< ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}"
<< std::endl;
}
// Declare rotating_mem_ptr here so it stays in scope until it is needed
std::unique_ptr<ck_tile::RotatingMemWrapper<ADataType, BDataType>> rotating_mem_ptr;
std::function<void()> preprocess;
auto clear_gemm_output = [&]() {
if(args.k_batch > 1)
hipGetErrorString(
hipMemsetAsync(args.e_ptr, 0, args.M * args.N * sizeof(CDataType), s.stream_id_));
};
if(s.flush_cache_)
{
std::cout << "Flushing cache..." << std::endl;
constexpr ck_tile::index_t APackedSize = ck_tile::numeric_traits<ADataType>::PackedSize;
constexpr ck_tile::index_t BPackedSize = ck_tile::numeric_traits<BDataType>::PackedSize;
ck_tile::HostTensor<ADataType> a_m(ck_tile::host_tensor_descriptor(
args.M, args.K, args.stride_A, is_row_major_t<ALayout>{}));
ck_tile::HostTensor<BDataType> b_n(ck_tile::host_tensor_descriptor(
args.K, args.N, args.stride_B, is_row_major_t<BLayout>{}));
auto size_a_buffer = a_m.get_element_space_size_in_bytes() / APackedSize;
auto size_b_buffer = b_n.get_element_space_size_in_bytes() / BPackedSize;
rotating_mem_ptr = std::make_unique<ck_tile::RotatingMemWrapper<ADataType, BDataType>>(
kargs.a_ptr, kargs.b_ptr, s.rotating_count_, size_a_buffer, size_b_buffer);
rotating_mem_ptr->Print();
preprocess = [&]() {
ck_tile::flush_icache();
rotating_mem_ptr->Next();
clear_gemm_output();
};
}
else
{
preprocess = clear_gemm_output;
}
return ck_tile::launch_kernel_time_mask(
s,
preprocess,
ck_tile::make_kernel<FlatmmConfig::kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
}