[rocm-libraries] ROCm/rocm-libraries#4594 (commit 1fce4cb)

[CK_TILE] MX GEMM non-preshuffled RCR layout

## Motivation

Implements a GEMM with MX scaling for fp4 and fp8 in non-preshuffled
layouts using async pipeline.

## Technical Details

<!-- Explain the changes along with any relevant GitHub links. -->

## Test Plan

<!-- Explain any relevant testing done to verify this PR. -->

## Test Result

<!-- Briefly summarize test outcomes. -->

## Submission Checklist

- [ ] Look over the contributing guidelines at
https://github.com/ROCm/ROCm/blob/develop/CONTRIBUTING.md#pull-requests.
This commit is contained in:
Sami Remes
2026-03-10 20:12:43 +00:00
committed by assistant-librarian[bot]
parent b8def2c724
commit 8f27f65d44
40 changed files with 2729 additions and 43 deletions

View File

@@ -1599,6 +1599,9 @@ struct WarpGemmAttributeMfmaImpl_f32_16x16x128_f8f6f4
static constexpr index_t kCM0PerLane = 1;
static constexpr index_t kCM1PerLane = 4;
// To get unity scale: 2^(kDefaultScale - 127) = 1.0
static constexpr index_t kDefaultScale = 0x7F7F7F7F;
// c_vec += a_vec * b_vec
template <index_t opselA, index_t opselB, bool post_nop_ = false>
CK_TILE_DEVICE void operator()(CVecType& c_vec,
@@ -1669,13 +1672,13 @@ struct WarpGemmAttributeMfmaImpl_f32_16x16x128_f8f6f4
const BVecType& b_vec,
bool_constant<post_nop_> = {}) const
{
operator()<0, 0>(c_vec, a_vec, 0, b_vec, 0);
operator()<0, 0>(c_vec, a_vec, kDefaultScale, b_vec, kDefaultScale);
}
// c_vec = a_vec * b_vec
CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const
{
return operator()<0, 0>(a_vec, 0, b_vec, 0);
return operator()<0, 0>(a_vec, kDefaultScale, b_vec, kDefaultScale);
}
};