Fp8 block scale quantization for fmha fwd (#3330)

* add block scale parameters to kernel * add block scale to kernel * add smoke test * format * Revert "format" This reverts commit 356c3c9706. * only format my code * format py * fix auto not allowd in function prototype * change instance tttt to ttff * fix structured binding issue * change s_acc elementwise op * async pipeline add block scale * add quantation P using shift exp2 * precompute (m - shift) once per row * change blk scale seqstrt ptr name * fix some name * fix for deduction guide * fix some comments * add P scale to qr_ksvs_pipeline * add comment to idx_identity * change the method of calculating descale block index * unify naming style: use block_scale_ as name prefix * unify naming style * update the CHANGELOG.md * Add FP8 block scale quantization support for FMHA forward kernel --------- Co-authored-by: Illia Silin <98187287+illsilin@users.noreply.github.com> Co-authored-by: Po Yen Chen <PoYen.Chen@amd.com>
2026-04-20 06:49:15 +00:00 · 2026-01-22 12:58:26 +08:00
parent 4c2c18ef48
commit dd0b4294af
14 changed files with 667 additions and 84 deletions
--- a/include/ck_tile/host/reference/reference_batched_gemm.hpp
+++ b/include/ck_tile/host/reference/reference_batched_gemm.hpp
@@ -47,4 +47,44 @@ CK_TILE_HOST void reference_batched_gemm(const HostTensor<ADataType>& a_b_m_k,
    make_ParallelTensorFunctor(f, c_b_m_n.mDesc.get_lengths()[0], c_b_m_n.mDesc.get_lengths()[1])(
        std::thread::hardware_concurrency());
 }
+template <typename ADataType,
+          typename BDataType,
+          typename AccDataType,
+          typename CDataType,
+          typename AElementOp   = ck_tile::idx_identity,
+          typename BElementOp   = ck_tile::idx_identity,
+          typename ACCElementOp = ck_tile::idx_identity>
+CK_TILE_HOST void reference_batched_quant_gemm(const HostTensor<ADataType>& a_b_m_k,
+                                               const HostTensor<BDataType>& b_b_n_k,
+                                               HostTensor<CDataType>& c_b_m_n,
+                                               const AElementOp& a_element_op     = {},
+                                               const BElementOp& b_element_op     = {},
+                                               const ACCElementOp& acc_element_op = {})
+{
+    const int N = b_b_n_k.mDesc.get_lengths()[1];
+    const int K = b_b_n_k.mDesc.get_lengths()[2];
+
+    auto f = [&](auto batch, auto m) {
+        for(int n = 0; n < N; ++n)
+        {
+            AccDataType v_acc = 0;
+
+            for(int k = 0; k < K; ++k)
+            {
+                AccDataType v_a = ck_tile::type_convert<AccDataType>(
+                    a_element_op(std::make_tuple(batch, m, k), a_b_m_k(batch, m, k)));
+                AccDataType v_b = ck_tile::type_convert<AccDataType>(
+                    b_element_op(std::make_tuple(batch, n, k), b_b_n_k(batch, n, k)));
+
+                v_acc += v_a * v_b;
+            }
+
+            c_b_m_n(batch, m, n) = ck_tile::type_convert<CDataType>(
+                acc_element_op(std::make_tuple(batch, m, n), v_acc));
+        }
+    };
+
+    make_ParallelTensorFunctor(f, c_b_m_n.mDesc.get_lengths()[0], c_b_m_n.mDesc.get_lengths()[1])(
+        std::thread::hardware_concurrency());
+}
 } // namespace ck_tile