Mirror of https://github.com/ROCm/composable_kernel.git (synced 2026-05-04 05:31:24 +00:00)
[rocm-libraries] ROCm/rocm-libraries#5939 (commit 6fb1791)
[CK_TILE] Flatten nested static_for loops into static_ford (#5939)

## Summary

A mechanical conversion of 129 nested `static_for`/`static_ford` patterns to flat `static_ford` across 29 ck_tile header files. Each conversion eliminates intermediate lambda closure instantiations by replacing nested compile-time loops with a single flat iteration that uses index decomposition.

### What `static_ford` eliminates

When `static_for` loops are nested, each nesting level creates unique closure types:

```cpp
// BEFORE: M + M×N = 20 IR functions (for M=4, N=4)
static_for<0, 4, 1>{}([&](auto m) {     // 4 closure instantiations
    static_for<0, 4, 1>{}([&](auto n) { // 4×4 = 16 closure instantiations
        body(m, n);
    });
});

// AFTER: M×N = 16 IR functions (with ford_applier, no intermediates)
static_ford<sequence<4, 4>>{}([&](auto mn) {
    constexpr auto m = number<mn[number<0>{}]>{};
    constexpr auto n = number<mn[number<1>{}]>{};
    body(m, n);
});
```

### Pattern categories converted

| Category | Count | Description |
|----------|-------|-------------|
| C (2-level `static_for` chains) | 112 | Nested `static_for` → `static_ford` |
| C3 (3-level `static_for` chains) | 9 | Three consecutive nests → `static_ford` |
| Partial rescue | 3 | Outer 2 levels of blocked 4-level nests |
| B (nested `static_ford` merge) | 5 | Two nested `static_ford` → single higher-dim `static_ford` |
| **Total** | **129** | Across 29 files |

Six false positives were detected and reverted (in `tensor_adaptor.hpp`, `tile_distribution.hpp`, `tile_distribution_encoding.hpp`) where the inner loop bound depended on the outer loop variable.

### Files changed by family

| Family | Files | Sites |
|--------|-------|-------|
| Block GEMM | 12 | ~20 |
| FlatMM pipelines | 4 | ~69 (including 5 ford-ford merges) |
| GEMM quant | 7 | ~22 |
| FlatMM kernel | 1 | 2 |
| FMHA | 1 | 2 |
| Reduce/norm | 2 | 2 |
| Epilogue | 1 | 1 |

### Blocked locations from review comments

These sites keep their nested loops because runtime statements sit between the loop levels; the shape that blocks flattening is sketched below this list.

- **block_gemm_areg_breg_creg_v1.hpp:356** — BLOCKED: runtime scale loads (`scale_a_slice`, `scale_b_slice`, A warp tensor load) between every nesting level
- **block_universal_gemm_ar_aquant_flatbr_bquant_cr.hpp:228** — BLOCKED: `zero_accumulators()` before inner loop; `sched_barrier` + conditional `block_sync_lds()` after inner loop
- **block_universal_gemm_as_aquant_bs_bquant_cr.hpp:298** — BLOCKED: runtime `CWarpTensor` construction before inner loop; quantization scale application code after inner loop
- **block_universal_gemm_as_aquant_bs_cr.hpp:277** — BLOCKED: same pattern as above
- **block_universal_gemm_as_bs_bquant_cr.hpp:367** — BLOCKED: same pattern as above
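As a rough illustration of why those sites are blocked, here is a minimal standalone C++17 sketch. The `number` and `static_for` helpers are simplified stand-ins written only for this note (not ck_tile's templates), and `runtime_work_calls` and the loop bounds are made up for the example:

```cpp
#include <cstdio>
#include <type_traits>
#include <utility>

// Simplified stand-ins for ck_tile's number<> and static_for<> (NOT the real templates).
template <std::size_t I>
using number = std::integral_constant<std::size_t, I>;

template <class F, std::size_t... Is>
constexpr void static_for_impl(F&& f, std::index_sequence<Is...>)
{
    (f(number<Is>{}), ...);
}

template <std::size_t N, class F>
constexpr void static_for(F&& f)
{
    static_for_impl(std::forward<F>(f), std::make_index_sequence<N>{});
}

int main()
{
    int runtime_work_calls = 0;

    // Blocked shape: runtime work sits BETWEEN the compile-time loop levels,
    // so it must run once per outer index (4 times here), not once per (m, n) pair.
    static_for<4>([&](auto /*m*/) {
        ++runtime_work_calls; // stands in for zero_accumulators(), load_tile(), block_sync_lds(), ...
        static_for<4>([&](auto /*n*/) {
            // inner body (warp GEMM) would go here
        });
    });

    // Naively flattening both levels into one 16-iteration loop would run the
    // runtime work 16 times and change behavior. Such sites stay nested
    // ("BLOCKED"); only levels with nothing between them get merged ("partial rescue").
    std::printf("runtime work ran %d times\n", runtime_work_calls); // prints 4
    return 0;
}
```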
## Depends on

- #5938 ([CK_TILE] Optimize static_ford and sequence compile-time infrastructure) — provides the `ford_applier` that makes these conversions beneficial. Without it, `static_ford` uses a recursive implementation that provides no IR function savings.

## Results (combined with #5938)

### Build Time (Wilcoxon signed-rank, 7 paired trials, gfx942)

| Target | Base (s) | Treat (s) | Delta | % | Significant? |
|--------|----------|-----------|-------|---|--------------|
| **flatmm** | 161.1 | 149.0 | **-12.1 s** | **-7.5%** | **YES** (p < 0.01, 7/7 wins) |
| **universal_gemm** | 225.4 | 220.3 | **-5.1 s** | **-2.3%** | **YES** (p < 0.01, 7/7 wins) |

### IR Function Counts (device trace, gfx942)

| Target | InstFunc | CodeGen |
|--------|----------|---------|
| universal_gemm | **-8.5%** | **-9.2%** |
| flatmm | **-7.6%** | **-10.5%** |

### ASM Equivalence

5/5 PASS — 650,151 lines verified identical (gfx942). TUs: universal_gemm, flatmm_basic, fmha_bwd, reduce, bscale.

## Test plan

- [x] ASM equivalence verified (650K lines, gfx942)
- [x] Wilcoxon timing verified (7 trials, p < 0.01)
- [x] IR function counts verified (-7.6% to -10.5% CodeGen reduction)
- [ ] CI

🤖 Generated with [Claude Code](https://claude.com/claude-code)
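For readers without the ck_tile headers at hand, here is a minimal, standalone C++17 model of the flattening idea. The `number`, `static_for`, and `static_ford2` helpers below are simplified stand-ins invented for this note, not ck_tile's templates (ck_tile's real `static_ford` takes a `sequence<...>` of lengths and relies on the `ford_applier` from #5938); the sketch only shows why the flat form needs a single closure where the nested form needs one per outer index.

```cpp
#include <cstdio>
#include <type_traits>
#include <utility>

// Simplified stand-ins, NOT ck_tile's templates; a 2D model of the same
// index-decomposition idea.
template <std::size_t I>
using number = std::integral_constant<std::size_t, I>;

template <class F, std::size_t... Is>
constexpr void static_for_impl(F&& f, std::index_sequence<Is...>)
{
    (f(number<Is>{}), ...);
}

template <std::size_t N, class F>
constexpr void static_for(F&& f)
{
    static_for_impl(std::forward<F>(f), std::make_index_sequence<N>{});
}

template <std::size_t N, class F, std::size_t... Flat>
constexpr void static_ford2_impl(F&& f, std::index_sequence<Flat...>)
{
    // One closure type, one call per flat index; (m, n) is recovered by
    // row-major decomposition of the flat index at compile time.
    (f(number<Flat / N>{}, number<Flat % N>{}), ...);
}

template <std::size_t M, std::size_t N, class F>
constexpr void static_ford2(F&& f)
{
    static_ford2_impl<N>(std::forward<F>(f), std::make_index_sequence<M * N>{});
}

int main()
{
    float acc[4][4] = {};

    // Nested form: the inner lambda is a distinct closure type for every outer m,
    // so the compiler emits M outer operator() instantiations plus M*N inner ones.
    static_for<4>([&](auto m) {
        static_for<4>([&](auto n) { acc[m][n] = 1.0f; });
    });

    // Flat form: a single closure, instantiated once per (m, n) pair (M*N total).
    static_ford2<4, 4>([&](auto m, auto n) { acc[m][n] += 1.0f; });

    std::printf("%.1f\n", acc[3][3]); // prints 2.0
    return 0;
}
```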
Committed by: assistant-librarian[bot]
Parent: c2ac7aa7b0
Commit: a170e2bd9d
@@ -210,45 +210,45 @@ struct BlockGemmWeightPreshuffleABQuantARegBRegCReg : public BlockGemmQuantBase
c_acc;

auto zero_accumulators = [&] {
    static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
        static_for<0, NIterPerWarp, 1>{}([&](auto nIter) {
            static_for<0, (WG::kM * WG::kN) / warp_size, 1>{}([&](auto i) {
                c_acc(mIter)(nIter).get_thread_buffer()[i] = 0.0f;
            }); // make sure WG::CWarpTensor exposes a clear/zero
    static_ford<sequence<MIterPerWarp, NIterPerWarp, (WG::kM * WG::kN) / warp_size>>{}(
        [&](auto mni) {
            constexpr auto mIter = number<mni[number<0>{}]>{};
            constexpr auto nIter = number<mni[number<1>{}]>{};
            constexpr auto i = number<mni[number<2>{}]>{};
            c_acc(mIter)(nIter).get_thread_buffer()[i] = 0.0f;
        });
    });
};
static_for<0, QScalesPerBlockRow, 1>{}([&](auto kQScale) {
    zero_accumulators();
    static_for<0, KIterPerQScale, 1>{}([&](auto kIterInQScale) {
        constexpr auto kIter = kQScale * KIterPerQScale + kIterInQScale;
        static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
            constexpr auto AwarpIter = (kIter * MIterPerWarp + mIter) % m_preload;
            static_for<0, NIterPerWarp, 1>{}([&](auto nIter) {
                // warp GEMM
                WG{}(c_acc(mIter)(nIter),
                     a_warp_tensor(number<AwarpIter>{}),
                     b_warp_tensor(nIter)(number<kIter>{}));
            });
            __builtin_amdgcn_sched_barrier(0x7F6);
            // preload next A from lds
            if constexpr((kIter * MIterPerWarp + mIter) <
                         (KIterPerWarp * MIterPerWarp - m_preload))
            {
                constexpr auto AmIter = (mIter + m_preload) % MIterPerWarp;
                constexpr auto AkIter = (kIter + (mIter + m_preload) / MIterPerWarp);

                load_and_convert_tile<UnaryOpSize>(
                    a_warp_tensor(number<AwarpIter>{}),
                    a_warp_windows(number<AmIter>{})(number<AkIter>{}));
            }
            // barrier
            // Could be deleted
            if constexpr((mIter == MIter_2nd_last))
            {
                block_sync_lds();
            }
        static_ford<sequence<KIterPerQScale, MIterPerWarp>>{}([&](auto km) {
            constexpr auto kIterInQScale = number<km[number<0>{}]>{};
            constexpr auto mIter = number<km[number<1>{}]>{};
            constexpr auto kIter = kQScale * KIterPerQScale + kIterInQScale;
            constexpr auto AwarpIter = (kIter * MIterPerWarp + mIter) % m_preload;
            static_for<0, NIterPerWarp, 1>{}([&](auto nIter) {
                // warp GEMM
                WG{}(c_acc(mIter)(nIter),
                     a_warp_tensor(number<AwarpIter>{}),
                     b_warp_tensor(nIter)(number<kIter>{}));
            });
            __builtin_amdgcn_sched_barrier(0x7F6);
            // preload next A from lds
            if constexpr((kIter * MIterPerWarp + mIter) <
                         (KIterPerWarp * MIterPerWarp - m_preload))
            {
                constexpr auto AmIter = (mIter + m_preload) % MIterPerWarp;
                constexpr auto AkIter = (kIter + (mIter + m_preload) / MIterPerWarp);

                load_and_convert_tile<UnaryOpSize>(
                    a_warp_tensor(number<AwarpIter>{}),
                    a_warp_windows(number<AmIter>{})(number<AkIter>{}));
            }
            // barrier
            // Could be deleted
            if constexpr((mIter == MIter_2nd_last))
            {
                block_sync_lds();
            }
        });
    static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
        AQPickerCommon<AQBlockTensor, Traits, mIter, kQScale> aq_picker(aq_block_tensor);

@@ -127,105 +127,103 @@ struct BlockGemmWeightPreshuffleBQuantARegBRegCReg
c_acc;

auto zero_accumulators = [&] {
    static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
        static_for<0, NIterPerWarp, 1>{}([&](auto nIter) {
            static_for<0, (WG::kM * WG::kN) / warp_size, 1>{}([&](auto i) {
                c_acc(mIter)(nIter).get_thread_buffer()[i] = 0.0f;
            }); // make sure WG::CWarpTensor exposes a clear/zero
    static_ford<sequence<MIterPerWarp, NIterPerWarp, (WG::kM * WG::kN) / warp_size>>{}(
        [&](auto mni) {
            constexpr auto mIter = number<mni[number<0>{}]>{};
            constexpr auto nIter = number<mni[number<1>{}]>{};
            constexpr auto i = number<mni[number<2>{}]>{};
            c_acc(mIter)(nIter).get_thread_buffer()[i] = 0.0f;
        });
    });
};
static_for<0, QScalesPerBlockRow, 1>{}([&](auto kQScale) {
    zero_accumulators();
    static_for<0, KIterPerQScale, 1>{}([&](auto kIterInQScale) {
        constexpr auto kIter = kQScale * KIterPerQScale + kIterInQScale;
        static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
            constexpr auto AwarpIter = (kIter * MIterPerWarp + mIter) % m_preload;
            static_for<0, NIterPerWarp, 1>{}([&](auto nIter) {
                // warp GEMM
                WG{}(c_acc(mIter)(nIter),
                     a_warp_tensor(number<AwarpIter>{}),
                     b_warp_tensor(nIter)(number<kIter>{}));
            });
            __builtin_amdgcn_sched_barrier(0x7F6);
            // preload next A from lds
            if constexpr((kIter * MIterPerWarp + mIter) <
                         (KIterPerWarp * MIterPerWarp - m_preload))
            {
                constexpr auto AmIter = (mIter + m_preload) % MIterPerWarp;
                constexpr auto AkIter = (kIter + (mIter + m_preload) / MIterPerWarp);
                a_warp_tensor(number<AwarpIter>{}) =
                    load_tile(a_warp_windows(number<AmIter>{})(number<AkIter>{}));
            }
            // barrier
            // Could be deleted
            if constexpr((mIter == MIter_2nd_last))
            {
                block_sync_lds();
            }
        });
    });
    static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
    static_ford<sequence<KIterPerQScale, MIterPerWarp>>{}([&](auto km) {
        constexpr auto kIterInQScale = number<km[number<0>{}]>{};
        constexpr auto mIter = number<km[number<1>{}]>{};
        constexpr auto kIter = kQScale * KIterPerQScale + kIterInQScale;
        constexpr auto AwarpIter = (kIter * MIterPerWarp + mIter) % m_preload;
        static_for<0, NIterPerWarp, 1>{}([&](auto nIter) {
            constexpr auto tbuf_offset =
                number<typename CBlockTensor::ThreadTensorDesc{}.calculate_offset(
                           merge_sequences(sequence<mIter, nIter>{},
                                           c_warp_y_index_zeros)) /
                       CBlockTensor::PackedSize>{};
            // warp GEMM
            WG{}(c_acc(mIter)(nIter),
                 a_warp_tensor(number<AwarpIter>{}),
                 b_warp_tensor(nIter)(number<kIter>{}));
        });
        __builtin_amdgcn_sched_barrier(0x7F6);
        // preload next A from lds
        if constexpr((kIter * MIterPerWarp + mIter) <
                     (KIterPerWarp * MIterPerWarp - m_preload))
        {
            constexpr auto AmIter = (mIter + m_preload) % MIterPerWarp;
            constexpr auto AkIter = (kIter + (mIter + m_preload) / MIterPerWarp);
            a_warp_tensor(number<AwarpIter>{}) =
                load_tile(a_warp_windows(number<AmIter>{})(number<AkIter>{}));
        }
        // barrier
        // Could be deleted
        if constexpr((mIter == MIter_2nd_last))
        {
            block_sync_lds();
        }
    });
    static_ford<sequence<MIterPerWarp, NIterPerWarp>>{}([&](auto mn) {
        constexpr auto mIter = number<mn[number<0>{}]>{};
        constexpr auto nIter = number<mn[number<1>{}]>{};
        constexpr auto tbuf_offset =
            number<typename CBlockTensor::ThreadTensorDesc{}.calculate_offset(
                       merge_sequences(sequence<mIter, nIter>{}, c_warp_y_index_zeros)) /
                   CBlockTensor::PackedSize>{};

        if constexpr(BPreshuffleQuant)
        if constexpr(BPreshuffleQuant)
        {
            constexpr index_t reg_offset = nIter;
            auto pull_from_lane = (__lane_id() & (WG::kN - 1)) * KPerBlockBQ + kQScale;
            auto& scale_reg = bq_block_tensor.get_thread_buffer()[reg_offset];
            // cross lane ops
            uint32_t scale_reg_dword;

            if constexpr(std::is_same_v<BQDataType, float>)
            {
            constexpr index_t reg_offset = nIter;
            auto pull_from_lane = (__lane_id() & (WG::kN - 1)) * KPerBlockBQ + kQScale;
            auto& scale_reg = bq_block_tensor.get_thread_buffer()[reg_offset];
            // cross lane ops
            uint32_t scale_reg_dword;

            if constexpr(std::is_same_v<BQDataType, float>)
            {
                scale_reg_dword = ck_tile::bit_cast<uint32_t>(scale_reg);
            }
            else
            {
                scale_reg_dword = static_cast<uint32_t>(scale_reg);
            }

            // cross lane ops to get the value of scale_reg.
            int gathered_scale_reg = __builtin_amdgcn_ds_bpermute(
                pull_from_lane << 2, __builtin_bit_cast(int, scale_reg_dword));

            float scale_reg_f = cvt_scale_to_fp32(gathered_scale_reg);

            static_for<0, WG::kM * WG::kN / warp_size, 1>{}([&](auto c_row) {
                auto& c_ref = c_block_tensor.get_thread_buffer()[tbuf_offset + c_row];
                const auto acc_val = c_acc(mIter)(nIter).get_thread_buffer()[c_row];
                c_ref = c_ref + acc_val * scale_reg_f;
            });
                scale_reg_dword = ck_tile::bit_cast<uint32_t>(scale_reg);
            }
            else
            {
                index_t reg_offset = [&]() {
                    if constexpr(BQuantGroupSize::kN >= (NWarp * WG::kN))
                    {
                        return (nIter * NWarp * WG::kN) / BQuantGroupSize::kN *
                                   KPerBlockBQ +
                               kQScale;
                    }
                    else
                    {
                        return nIter * KPerBlockBQ + kQScale;
                    }
                }();
                auto& scale_reg = bq_block_tensor.get_thread_buffer()[reg_offset];
                float scale_reg_f = cvt_scale_to_fp32(scale_reg);

                static_for<0, WG::kM * WG::kN / warp_size, 1>{}([&](auto c_row) {
                    auto& c_ref = c_block_tensor.get_thread_buffer()[tbuf_offset + c_row];
                    const auto acc_val = c_acc(mIter)(nIter).get_thread_buffer()[c_row];
                    c_ref = c_ref + acc_val * scale_reg_f;
                });
                scale_reg_dword = static_cast<uint32_t>(scale_reg);
            }
            });

            // cross lane ops to get the value of scale_reg.
            int gathered_scale_reg = __builtin_amdgcn_ds_bpermute(
                pull_from_lane << 2, __builtin_bit_cast(int, scale_reg_dword));

            float scale_reg_f = cvt_scale_to_fp32(gathered_scale_reg);

            static_for<0, WG::kM * WG::kN / warp_size, 1>{}([&](auto c_row) {
                auto& c_ref = c_block_tensor.get_thread_buffer()[tbuf_offset + c_row];
                const auto acc_val = c_acc(mIter)(nIter).get_thread_buffer()[c_row];
                c_ref = c_ref + acc_val * scale_reg_f;
            });
        }
        else
        {
            index_t reg_offset = [&]() {
                if constexpr(BQuantGroupSize::kN >= (NWarp * WG::kN))
                {
                    return (nIter * NWarp * WG::kN) / BQuantGroupSize::kN * KPerBlockBQ +
                           kQScale;
                }
                else
                {
                    return nIter * KPerBlockBQ + kQScale;
                }
            }();
            auto& scale_reg = bq_block_tensor.get_thread_buffer()[reg_offset];
            float scale_reg_f = cvt_scale_to_fp32(scale_reg);

            static_for<0, WG::kM * WG::kN / warp_size, 1>{}([&](auto c_row) {
                auto& c_ref = c_block_tensor.get_thread_buffer()[tbuf_offset + c_row];
                const auto acc_val = c_acc(mIter)(nIter).get_thread_buffer()[c_row];
                c_ref = c_ref + acc_val * scale_reg_f;
            });
        }
    });
    });
}

@@ -290,121 +290,115 @@ struct ABQuantBlockUniversalGemmAsBsCr : public BlockGemmQuantBase
constexpr auto warp_size = get_warp_size();

// hot loop:
static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
    static_for<0, NIterPerWarp, 1>{}([&](auto nIter) {
        CWarpTensor c_warp_tensor;
static_ford<sequence<MIterPerWarp, NIterPerWarp>>{}([&](auto mn) {
    constexpr auto mIter = number<mn[number<0>{}]>{};
    constexpr auto nIter = number<mn[number<1>{}]>{};
    CWarpTensor c_warp_tensor;

        static_for<0, Traits::QScalesPerBlockRow, 1>{}([&](auto kQScale) {
            static_for<0, Traits::KIterPerQScale, 1>{}([&](auto kIterInQScale) {
                constexpr auto kIter = kQScale * Traits::KIterPerQScale + kIterInQScale;
    static_for<0, Traits::QScalesPerBlockRow, 1>{}([&](auto kQScale) {
        static_for<0, Traits::KIterPerQScale, 1>{}([&](auto kIterInQScale) {
            constexpr auto kIter = kQScale * Traits::KIterPerQScale + kIterInQScale;

                AWarpTensor a_warp_tensor;
                a_warp_tensor.get_thread_buffer() =
                    a_warp_tile_.get_y_sliced_thread_data(
                        merge_sequences(sequence<mIter, kIter>{}, a_warp_y_index_zeros),
                        merge_sequences(sequence<1, 1>{}, a_warp_y_lengths));
            AWarpTensor a_warp_tensor;
            a_warp_tensor.get_thread_buffer() = a_warp_tile_.get_y_sliced_thread_data(
                merge_sequences(sequence<mIter, kIter>{}, a_warp_y_index_zeros),
                merge_sequences(sequence<1, 1>{}, a_warp_y_lengths));

                BWarpTensor b_warp_tensor;
                b_warp_tensor.get_thread_buffer() =
                    b_warp_tile_.get_y_sliced_thread_data(
                        merge_sequences(sequence<nIter, kIter>{}, b_warp_y_index_zeros),
                        merge_sequences(sequence<1, 1>{}, b_warp_y_lengths));
            BWarpTensor b_warp_tensor;
            b_warp_tensor.get_thread_buffer() = b_warp_tile_.get_y_sliced_thread_data(
                merge_sequences(sequence<nIter, kIter>{}, b_warp_y_index_zeros),
                merge_sequences(sequence<1, 1>{}, b_warp_y_lengths));

            if constexpr(kIterInQScale == 0)
            {
                c_warp_tensor = WarpGemm{}(a_warp_tensor, b_warp_tensor);
            }
            else
            {
                WarpGemm{}(c_warp_tensor, a_warp_tensor, b_warp_tensor);
            }
        });

            constexpr auto tbuf_offset =
                number<typename CBlockTensor::ThreadTensorDesc{}.calculate_offset(
                           merge_sequences(sequence<mIter, nIter>{},
                                           c_warp_y_index_zeros)) /
                       CBlockTensor::PackedSize>{};
            // a_scale
            AQPickerCommon<AQBlockTensor, Traits, mIter, kQScale> aq_picker(
                aq_block_tensor);

            if constexpr(BPreshuffleQuant)
            if constexpr(kIterInQScale == 0)
            {
                constexpr index_t reg_offset = [&]() {
                    if constexpr(GemmTraits::BQuantGroupSize::kN >
                                     (NWarp * WarpGemm::kN) &&
                                 Traits::NPerBlock == GemmTraits::BQuantGroupSize::kN)
                    {
                        return kQScale;
                    }
                    else
                    {
                        return nIter;
                    }
                }();

                auto pull_from_lane =
                    (__lane_id() & (WarpGemm::kN - 1)) * Traits::KQPerBlock + kQScale;

                auto& scale_reg = bq_block_tensor.get_thread_buffer()[reg_offset];
                // cross lane ops
                uint32_t scale_reg_dword;

                if constexpr(std::is_same_v<BQDataType, float>)
                {
                    scale_reg_dword = ck_tile::bit_cast<uint32_t>(scale_reg);
                }
                else
                {
                    scale_reg_dword = static_cast<uint32_t>(scale_reg);
                }

                // cross lane ops to get the value of scale_reg.
                int gathered_scale_reg = __builtin_amdgcn_ds_bpermute(
                    pull_from_lane << 2, __builtin_bit_cast(int, scale_reg_dword));

                float b_scale_reg_f =
                    Base::cvt_scale_to_fp32<typename Traits::BQDataType>(
                        gathered_scale_reg);

                static_for<0, WarpGemm::kM * WarpGemm::kN / warp_size, 1>{}(
                    [&](auto c_row) {
                        float a_scale_reg_f = aq_picker.template pick<c_row>();
                        c_block_tensor.get_thread_buffer()[tbuf_offset + c_row] +=
                            (c_warp_tensor.get_thread_buffer()[c_row] * a_scale_reg_f *
                             b_scale_reg_f);
                    });
                c_warp_tensor = WarpGemm{}(a_warp_tensor, b_warp_tensor);
            }
            else
            {
                // Multiply bquant with accumulated C
                constexpr index_t reg_offset = [&]() {
                    if constexpr(GemmTraits::BQuantGroupSize::kN >=
                                 (NWarp * WarpGemm::kN))
                        return (nIter * NWarp * WarpGemm::kN) /
                                   GemmTraits::BQuantGroupSize::kN *
                                   Traits::KQPerBlock +
                               kQScale;
                    else
                    {
                        return nIter * Traits::KQPerBlock + kQScale;
                    }
                }();

                auto& scale_reg = bq_block_tensor.get_thread_buffer()[reg_offset];
                float b_scale_reg_f =
                    Base::cvt_scale_to_fp32<typename Traits::BQDataType>(scale_reg);

                static_for<0, WarpGemm::kM * WarpGemm::kN / warp_size, 1>{}(
                    [&](auto c_row) {
                        float a_scale_reg_f = aq_picker.template pick<c_row>();
                        c_block_tensor.get_thread_buffer()[tbuf_offset + c_row] +=
                            (c_warp_tensor.get_thread_buffer()[c_row] * a_scale_reg_f *
                             b_scale_reg_f);
                    });
                WarpGemm{}(c_warp_tensor, a_warp_tensor, b_warp_tensor);
            }
        });

        constexpr auto tbuf_offset =
            number<typename CBlockTensor::ThreadTensorDesc{}.calculate_offset(
                       merge_sequences(sequence<mIter, nIter>{},
                                       c_warp_y_index_zeros)) /
                   CBlockTensor::PackedSize>{};
        // a_scale
        AQPickerCommon<AQBlockTensor, Traits, mIter, kQScale> aq_picker(
            aq_block_tensor);

        if constexpr(BPreshuffleQuant)
        {
            constexpr index_t reg_offset = [&]() {
                if constexpr(GemmTraits::BQuantGroupSize::kN > (NWarp * WarpGemm::kN) &&
                             Traits::NPerBlock == GemmTraits::BQuantGroupSize::kN)
                {
                    return kQScale;
                }
                else
                {
                    return nIter;
                }
            }();

            auto pull_from_lane =
                (__lane_id() & (WarpGemm::kN - 1)) * Traits::KQPerBlock + kQScale;

            auto& scale_reg = bq_block_tensor.get_thread_buffer()[reg_offset];
            // cross lane ops
            uint32_t scale_reg_dword;

            if constexpr(std::is_same_v<BQDataType, float>)
            {
                scale_reg_dword = ck_tile::bit_cast<uint32_t>(scale_reg);
            }
            else
            {
                scale_reg_dword = static_cast<uint32_t>(scale_reg);
            }

            // cross lane ops to get the value of scale_reg.
            int gathered_scale_reg = __builtin_amdgcn_ds_bpermute(
                pull_from_lane << 2, __builtin_bit_cast(int, scale_reg_dword));

            float b_scale_reg_f = Base::cvt_scale_to_fp32<typename Traits::BQDataType>(
                gathered_scale_reg);

            static_for<0, WarpGemm::kM * WarpGemm::kN / warp_size, 1>{}(
                [&](auto c_row) {
                    float a_scale_reg_f = aq_picker.template pick<c_row>();
                    c_block_tensor.get_thread_buffer()[tbuf_offset + c_row] +=
                        (c_warp_tensor.get_thread_buffer()[c_row] * a_scale_reg_f *
                         b_scale_reg_f);
                });
        }
        else
        {
            // Multiply bquant with accumulated C
            constexpr index_t reg_offset = [&]() {
                if constexpr(GemmTraits::BQuantGroupSize::kN >= (NWarp * WarpGemm::kN))
                    return (nIter * NWarp * WarpGemm::kN) /
                               GemmTraits::BQuantGroupSize::kN * Traits::KQPerBlock +
                           kQScale;
                else
                {
                    return nIter * Traits::KQPerBlock + kQScale;
                }
            }();

            auto& scale_reg = bq_block_tensor.get_thread_buffer()[reg_offset];
            float b_scale_reg_f =
                Base::cvt_scale_to_fp32<typename Traits::BQDataType>(scale_reg);

            static_for<0, WarpGemm::kM * WarpGemm::kN / warp_size, 1>{}(
                [&](auto c_row) {
                    float a_scale_reg_f = aq_picker.template pick<c_row>();
                    c_block_tensor.get_thread_buffer()[tbuf_offset + c_row] +=
                        (c_warp_tensor.get_thread_buffer()[c_row] * a_scale_reg_f *
                         b_scale_reg_f);
                });
        }
    });
});
}

@@ -268,54 +268,51 @@ struct AQuantBlockUniversalGemmAsBsCr
constexpr auto warp_size = get_warp_size();

// hot loop:
static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
    static_for<0, NIterPerWarp, 1>{}([&](auto nIter) {
        CWarpTensor c_warp_tensor;
static_ford<sequence<MIterPerWarp, NIterPerWarp>>{}([&](auto mn) {
    constexpr auto mIter = number<mn[number<0>{}]>{};
    constexpr auto nIter = number<mn[number<1>{}]>{};
    CWarpTensor c_warp_tensor;

        // for every column in AQ
        static_for<0, Traits::QScalesPerBlockRow, 1>{}([&](auto kQScale) {
            // for every warp corresponding to a quantization scale
            static_for<0, Traits::KIterPerQScale, 1>{}([&](auto kIterInQScale) {
                constexpr auto kIter = kQScale * Traits::KIterPerQScale + kIterInQScale;
    // for every column in AQ
    static_for<0, Traits::QScalesPerBlockRow, 1>{}([&](auto kQScale) {
        // for every warp corresponding to a quantization scale
        static_for<0, Traits::KIterPerQScale, 1>{}([&](auto kIterInQScale) {
            constexpr auto kIter = kQScale * Traits::KIterPerQScale + kIterInQScale;

                AWarpTensor a_warp_tensor;
                a_warp_tensor.get_thread_buffer() =
                    a_warp_tile_.get_y_sliced_thread_data(
                        merge_sequences(sequence<mIter, kIter>{}, a_warp_y_index_zeros),
                        merge_sequences(sequence<1, 1>{}, a_warp_y_lengths));
            AWarpTensor a_warp_tensor;
            a_warp_tensor.get_thread_buffer() = a_warp_tile_.get_y_sliced_thread_data(
                merge_sequences(sequence<mIter, kIter>{}, a_warp_y_index_zeros),
                merge_sequences(sequence<1, 1>{}, a_warp_y_lengths));

                BWarpTensor b_warp_tensor;
                b_warp_tensor.get_thread_buffer() =
                    b_warp_tile_.get_y_sliced_thread_data(
                        merge_sequences(sequence<nIter, kIter>{}, b_warp_y_index_zeros),
                        merge_sequences(sequence<1, 1>{}, b_warp_y_lengths));
            BWarpTensor b_warp_tensor;
            b_warp_tensor.get_thread_buffer() = b_warp_tile_.get_y_sliced_thread_data(
                merge_sequences(sequence<nIter, kIter>{}, b_warp_y_index_zeros),
                merge_sequences(sequence<1, 1>{}, b_warp_y_lengths));

                if constexpr(kIterInQScale == 0)
                {
                    c_warp_tensor = WarpGemm{}(a_warp_tensor, b_warp_tensor);
                }
                else
                {
                    WarpGemm{}(c_warp_tensor, a_warp_tensor, b_warp_tensor);
                }
            });
            if constexpr(kIterInQScale == 0)
            {
                c_warp_tensor = WarpGemm{}(a_warp_tensor, b_warp_tensor);
            }
            else
            {
                WarpGemm{}(c_warp_tensor, a_warp_tensor, b_warp_tensor);
            }
        });

            constexpr auto tbuf_offset =
                number<typename CBlockTensor::ThreadTensorDesc{}.calculate_offset(
                           merge_sequences(sequence<mIter, nIter>{},
                                           c_warp_y_index_zeros)) /
                       CBlockTensor::PackedSize>{};
        constexpr auto tbuf_offset =
            number<typename CBlockTensor::ThreadTensorDesc{}.calculate_offset(
                       merge_sequences(sequence<mIter, nIter>{},
                                       c_warp_y_index_zeros)) /
                   CBlockTensor::PackedSize>{};

            AQPickerCommon<AQBlockTensor, Traits, mIter, kQScale> aq_picker(
                aq_block_tensor);
        AQPickerCommon<AQBlockTensor, Traits, mIter, kQScale> aq_picker(
            aq_block_tensor);

            static_for<0, WarpGemm::kM * WarpGemm::kN / warp_size, 1>{}(
                [&](auto c_row) {
                    float scale_reg_f = aq_picker.template pick<c_row>();
        static_for<0, WarpGemm::kM * WarpGemm::kN / warp_size, 1>{}([&](auto c_row) {
            float scale_reg_f = aq_picker.template pick<c_row>();

                    c_block_tensor.get_thread_buffer()[tbuf_offset + c_row] +=
                        (c_warp_tensor.get_thread_buffer()[c_row] * scale_reg_f);
                });
            c_block_tensor.get_thread_buffer()[tbuf_offset + c_row] +=
                (c_warp_tensor.get_thread_buffer()[c_row] * scale_reg_f);
        });
    });
});

@@ -290,57 +290,55 @@ struct BQuantBlockUniversalGemmAsBsCr
using SrcVectorRawType = ext_vector_t<BDataTypeRaw, UnaryOpSize_ / BPackedSize>;
using DstVectorType = ext_vector_t<ComputeDataType, UnaryOpSize_>;

static_for<0, NIterPerWarp, 1>{}([&](auto nIter) {
    static_for<0, Traits::QScalesPerBlockRow, 1>{}([&](auto kQScale) {
        // B scale register offset
        constexpr index_t reg_offset = [&]() {
            if constexpr(GemmTraits::BQuantGroupSize::kN >= (NWarp * WarpGemm::kN))
                return ((nIter * NWarp * WarpGemm::kN) /
                        GemmTraits::BQuantGroupSize::kN) *
                           Traits::KQPerBlock +
                       kQScale;
            else
            {
                return nIter * Traits::KQPerBlock + kQScale;
            }
        }();
static_ford<sequence<NIterPerWarp, Traits::QScalesPerBlockRow>>{}([&](auto nk) {
    constexpr auto nIter = number<nk[number<0>{}]>{};
    constexpr auto kQScale = number<nk[number<1>{}]>{};
    // B scale register offset
    constexpr index_t reg_offset = [&]() {
        if constexpr(GemmTraits::BQuantGroupSize::kN >= (NWarp * WarpGemm::kN))
            return ((nIter * NWarp * WarpGemm::kN) / GemmTraits::BQuantGroupSize::kN) *
                       Traits::KQPerBlock +
                   kQScale;
        else
        {
            return nIter * Traits::KQPerBlock + kQScale;
        }
    }();

        // Get B scale from thread buffer
        auto& scale_reg = bq_block_tensor.get_thread_buffer()[reg_offset];
        float b_scale_f = float(scale_reg);
    // Get B scale from thread buffer
    auto& scale_reg = bq_block_tensor.get_thread_buffer()[reg_offset];
    float b_scale_f = float(scale_reg);

        static_for<0, Traits::KIterPerQScale, 1>{}([&](auto kIterInQScale) {
            constexpr auto kIter = kQScale * Traits::KIterPerQScale + kIterInQScale;
            // Thread buffers
            using BWarpThreadBuffer = decltype(b_warp_tile_.get_y_sliced_thread_data(
                merge_sequences(sequence<nIter, kIter>{}, b_warp_y_index_zeros),
                merge_sequences(sequence<1, 1>{}, b_warp_y_lengths)));
            using BLDSThreadBuffer = decltype(b_warp_tile_lds_.get_y_sliced_thread_data(
                merge_sequences(sequence<nIter, kIter>{}, b_warp_y_index_zeros),
                merge_sequences(sequence<1, 1>{}, b_warp_y_lengths)));
    static_for<0, Traits::KIterPerQScale, 1>{}([&](auto kIterInQScale) {
        constexpr auto kIter = kQScale * Traits::KIterPerQScale + kIterInQScale;
        // Thread buffers
        using BWarpThreadBuffer = decltype(b_warp_tile_.get_y_sliced_thread_data(
            merge_sequences(sequence<nIter, kIter>{}, b_warp_y_index_zeros),
            merge_sequences(sequence<1, 1>{}, b_warp_y_lengths)));
        using BLDSThreadBuffer = decltype(b_warp_tile_lds_.get_y_sliced_thread_data(
            merge_sequences(sequence<nIter, kIter>{}, b_warp_y_index_zeros),
            merge_sequences(sequence<1, 1>{}, b_warp_y_lengths)));

            BWarpThreadBuffer b_warp_thread_buffer;
            BLDSThreadBuffer b_lds_thread_buffer;
        BWarpThreadBuffer b_warp_thread_buffer;
        BLDSThreadBuffer b_lds_thread_buffer;

            // Load thread buffer from tile (LDS type)
            b_lds_thread_buffer = b_warp_tile_lds_.get_y_sliced_thread_data(
                merge_sequences(sequence<nIter, kIter>{}, b_warp_y_index_zeros),
                merge_sequences(sequence<1, 1>{}, b_warp_y_lengths));
        // Load thread buffer from tile (LDS type)
        b_lds_thread_buffer = b_warp_tile_lds_.get_y_sliced_thread_data(
            merge_sequences(sequence<nIter, kIter>{}, b_warp_y_index_zeros),
            merge_sequences(sequence<1, 1>{}, b_warp_y_lengths));

            // Apply scale to B thread buffer and cast
            static_for<0, thread_buffer_size, 1>{}([&](auto i) {
                elementwise_op(
                    b_warp_thread_buffer.template get_as<DstVectorType>()(i),
                    b_lds_thread_buffer.template get_as<SrcVectorRawType>()[i],
                    b_scale_f);
            });

            // Store B thread buffer to tile (MMA type)
            b_warp_tile_.set_y_sliced_thread_data(
                merge_sequences(sequence<nIter, kIter>{}, b_warp_y_index_zeros),
                merge_sequences(sequence<1, 1>{}, b_warp_y_lengths),
                b_warp_thread_buffer);
        // Apply scale to B thread buffer and cast
        static_for<0, thread_buffer_size, 1>{}([&](auto i) {
            elementwise_op(b_warp_thread_buffer.template get_as<DstVectorType>()(i),
                           b_lds_thread_buffer.template get_as<SrcVectorRawType>()[i],
                           b_scale_f);
        });

        // Store B thread buffer to tile (MMA type)
        b_warp_tile_.set_y_sliced_thread_data(
            merge_sequences(sequence<nIter, kIter>{}, b_warp_y_index_zeros),
            merge_sequences(sequence<1, 1>{}, b_warp_y_lengths),
            b_warp_thread_buffer);
    });
});
}
@@ -361,113 +359,107 @@ struct BQuantBlockUniversalGemmAsBsCr
constexpr auto warp_size = get_warp_size();

// hot loop:
static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
    static_for<0, NIterPerWarp, 1>{}([&](auto nIter) {
        CWarpTensor c_warp_tensor;
static_ford<sequence<MIterPerWarp, NIterPerWarp>>{}([&](auto mn) {
    constexpr auto mIter = number<mn[number<0>{}]>{};
    constexpr auto nIter = number<mn[number<1>{}]>{};
    CWarpTensor c_warp_tensor;

        static_for<0, Traits::QScalesPerBlockRow, 1>{}([&](auto kQScale) {
            static_for<0, Traits::KIterPerQScale, 1>{}([&](auto kIterInQScale) {
                constexpr auto kIter = kQScale * Traits::KIterPerQScale + kIterInQScale;
    static_for<0, Traits::QScalesPerBlockRow, 1>{}([&](auto kQScale) {
        static_for<0, Traits::KIterPerQScale, 1>{}([&](auto kIterInQScale) {
            constexpr auto kIter = kQScale * Traits::KIterPerQScale + kIterInQScale;

                AWarpTensor a_warp_tensor;
                a_warp_tensor.get_thread_buffer() =
                    a_warp_tile_.get_y_sliced_thread_data(
                        merge_sequences(sequence<mIter, kIter>{}, a_warp_y_index_zeros),
                        merge_sequences(sequence<1, 1>{}, a_warp_y_lengths));
            AWarpTensor a_warp_tensor;
            a_warp_tensor.get_thread_buffer() = a_warp_tile_.get_y_sliced_thread_data(
                merge_sequences(sequence<mIter, kIter>{}, a_warp_y_index_zeros),
                merge_sequences(sequence<1, 1>{}, a_warp_y_lengths));

                BWarpTensor b_warp_tensor;
                b_warp_tensor.get_thread_buffer() =
                    b_warp_tile_.get_y_sliced_thread_data(
                        merge_sequences(sequence<nIter, kIter>{}, b_warp_y_index_zeros),
                        merge_sequences(sequence<1, 1>{}, b_warp_y_lengths));
            BWarpTensor b_warp_tensor;
            b_warp_tensor.get_thread_buffer() = b_warp_tile_.get_y_sliced_thread_data(
                merge_sequences(sequence<nIter, kIter>{}, b_warp_y_index_zeros),
                merge_sequences(sequence<1, 1>{}, b_warp_y_lengths));

                if constexpr(kIterInQScale == 0)
                {
                    c_warp_tensor = WarpGemm{}(a_warp_tensor, b_warp_tensor);
                }
                else
                {
                    WarpGemm{}(c_warp_tensor, a_warp_tensor, b_warp_tensor);
                }
            });

                constexpr auto tbuf_offset =
                    number<typename CBlockTensor::ThreadTensorDesc{}.calculate_offset(
                               merge_sequences(sequence<mIter, nIter>{},
                                               c_warp_y_index_zeros)) /
                           CBlockTensor::PackedSize>{};

                if constexpr(BPreshuffleQuant)
            if constexpr(kIterInQScale == 0)
                {
                    constexpr index_t reg_offset = [&]() {
                        if constexpr(GemmTraits::BQuantGroupSize::kN >
                                         (NWarp * WarpGemm::kN) &&
                                     Traits::NPerBlock == GemmTraits::BQuantGroupSize::kN)
                        {
                            return kQScale; // prefill: one quant group per block
                        }
                        else
                        {
                            return nIter; // decode or multiple groups per warp
                        }
                    }();

                    auto pull_from_lane =
                        (__lane_id() & (WarpGemm::kN - 1)) * Traits::KQPerBlock + kQScale;

                    auto& scale_reg = bq_block_tensor.get_thread_buffer()[reg_offset];
                    // cross lane ops
                    uint32_t scale_reg_dword;

                    if constexpr(std::is_same_v<BQDataType, float>)
                    {
                        scale_reg_dword = ck_tile::bit_cast<uint32_t>(scale_reg);
                    }
                    else
                    {
                        scale_reg_dword = static_cast<uint32_t>(scale_reg);
                    }

                    // cross lane ops to get the value of scale_reg.
                    int gathered_scale_reg = __builtin_amdgcn_ds_bpermute(
                        pull_from_lane << 2, __builtin_bit_cast(int, scale_reg_dword));

                    float scale_reg_f =
                        Base::cvt_scale_to_fp32<typename Traits::BQDataType>(
                            gathered_scale_reg);

                    static_for<0, WarpGemm::kM * WarpGemm::kN / warp_size, 1>{}(
                        [&](auto c_row) {
                            c_block_tensor.get_thread_buffer()[tbuf_offset + c_row] +=
                                (c_warp_tensor.get_thread_buffer()[c_row] * scale_reg_f);
                        });
                c_warp_tensor = WarpGemm{}(a_warp_tensor, b_warp_tensor);
                }
                else
                {
                    // Multiply bquant with accumulated C
                    constexpr index_t reg_offset = [&]() {
                        if constexpr(GemmTraits::BQuantGroupSize::kN >=
                                     (NWarp * WarpGemm::kN))
                            return (nIter * NWarp * WarpGemm::kN) /
                                       GemmTraits::BQuantGroupSize::kN *
                                       Traits::KQPerBlock +
                                   kQScale;
                        else
                        {
                            return nIter * Traits::KQPerBlock + kQScale;
                        }
                    }();

                    auto& scale_reg = bq_block_tensor.get_thread_buffer()[reg_offset];
                    float scale_reg_f =
                        Base::cvt_scale_to_fp32<typename Traits::BQDataType>(scale_reg);
                    static_for<0, WarpGemm::kM * WarpGemm::kN / warp_size, 1>{}(
                        [&](auto c_row) {
                            c_block_tensor.get_thread_buffer()[tbuf_offset + c_row] +=
                                (c_warp_tensor.get_thread_buffer()[c_row] * scale_reg_f);
                        });
                WarpGemm{}(c_warp_tensor, a_warp_tensor, b_warp_tensor);
                }
            });

            constexpr auto tbuf_offset =
                number<typename CBlockTensor::ThreadTensorDesc{}.calculate_offset(
                           merge_sequences(sequence<mIter, nIter>{},
                                           c_warp_y_index_zeros)) /
                       CBlockTensor::PackedSize>{};

            if constexpr(BPreshuffleQuant)
            {
                constexpr index_t reg_offset = [&]() {
                    if constexpr(GemmTraits::BQuantGroupSize::kN > (NWarp * WarpGemm::kN) &&
                                 Traits::NPerBlock == GemmTraits::BQuantGroupSize::kN)
                    {
                        return kQScale; // prefill: one quant group per block
                    }
                    else
                    {
                        return nIter; // decode or multiple groups per warp
                    }
                }();

                auto pull_from_lane =
                    (__lane_id() & (WarpGemm::kN - 1)) * Traits::KQPerBlock + kQScale;

                auto& scale_reg = bq_block_tensor.get_thread_buffer()[reg_offset];
                // cross lane ops
                uint32_t scale_reg_dword;

                if constexpr(std::is_same_v<BQDataType, float>)
                {
                    scale_reg_dword = ck_tile::bit_cast<uint32_t>(scale_reg);
                }
                else
                {
                    scale_reg_dword = static_cast<uint32_t>(scale_reg);
                }

                // cross lane ops to get the value of scale_reg.
                int gathered_scale_reg = __builtin_amdgcn_ds_bpermute(
                    pull_from_lane << 2, __builtin_bit_cast(int, scale_reg_dword));

                float scale_reg_f = Base::cvt_scale_to_fp32<typename Traits::BQDataType>(
                    gathered_scale_reg);

                static_for<0, WarpGemm::kM * WarpGemm::kN / warp_size, 1>{}(
                    [&](auto c_row) {
                        c_block_tensor.get_thread_buffer()[tbuf_offset + c_row] +=
                            (c_warp_tensor.get_thread_buffer()[c_row] * scale_reg_f);
                    });
            }
            else
            {
                // Multiply bquant with accumulated C
                constexpr index_t reg_offset = [&]() {
                    if constexpr(GemmTraits::BQuantGroupSize::kN >= (NWarp * WarpGemm::kN))
                        return (nIter * NWarp * WarpGemm::kN) /
                                   GemmTraits::BQuantGroupSize::kN * Traits::KQPerBlock +
                               kQScale;
                    else
                    {
                        return nIter * Traits::KQPerBlock + kQScale;
                    }
                }();

                auto& scale_reg = bq_block_tensor.get_thread_buffer()[reg_offset];
                float scale_reg_f =
                    Base::cvt_scale_to_fp32<typename Traits::BQDataType>(scale_reg);
                static_for<0, WarpGemm::kM * WarpGemm::kN / warp_size, 1>{}(
                    [&](auto c_row) {
                        c_block_tensor.get_thread_buffer()[tbuf_offset + c_row] +=
                            (c_warp_tensor.get_thread_buffer()[c_row] * scale_reg_f);
                    });
            }
        });
    });
}