[CK_TILE] Update flatmm related kernels (#3022)

--------- Co-authored-by: Ding, Yi <yi.ding@amd.com> Co-authored-by: felix <felix.li@amd.com>
2026-04-20 06:49:15 +00:00 · 2025-10-22 22:36:11 +08:00
parent cbd1279ae6
commit 211d64e18a
39 changed files with 11183 additions and 739 deletions
--- a/include/ck_tile/host/reference/reference_gemm.hpp
+++ b/include/ck_tile/host/reference/reference_gemm.hpp
@@ -480,6 +480,14 @@ __global__ void naive_gemm_kernel(ADataType* A,
                else
                    v_a = fp32_val.lo;
            }
+            else if constexpr(std::is_same_v<ADataType, pk_fp4_t>)
+            {
+                const fp32x2_t fp32_val = pk_fp4_to_fp32x2(A[a_index / packed_size_a]);
+                if(k % 2 == 1)
+                    v_a = fp32_val.hi;
+                else
+                    v_a = fp32_val.lo;
+            }
            else
            {
                v_a = ck_tile::type_convert<AccDataType>(A[a_index]);
@@ -492,6 +500,14 @@ __global__ void naive_gemm_kernel(ADataType* A,
                else
                    v_b = fp32_val.lo;
            }
+            else if constexpr(std::is_same_v<BDataType, pk_fp4_t>)
+            {
+                const fp32x2_t fp32_val = pk_fp4_to_fp32x2(B[b_index / packed_size_b]);
+                if(k % 2 == 1)
+                    v_b = fp32_val.hi;
+                else
+                    v_b = fp32_val.lo;
+            }
            else
            {
                v_b = ck_tile::type_convert<AccDataType>(B[b_index]);
@@ -506,6 +522,121 @@ __global__ void naive_gemm_kernel(ADataType* A,
    }
 }

+/// Reference GPU kernel for a GEMM whose operands carry block-wise float scale factors.
+///
+/// Each thread produces one element C(row, col), where the flat thread index is split as
+/// row = idx / N and col = idx % N. Products a * b are summed into a partial sum; whenever
+/// k reaches a multiple of scale_granularity_k the partial sum is multiplied by the current
+/// scale_A * scale_B, folded into the accumulator, and the scales for the next K block are
+/// loaded. The last partial sum is folded in after the loop.
+///
+/// Scale lookup, as indexed below:
+///   scale_A_ptr[row / scale_granularity_m + (k / scale_granularity_k) * ceil_div(M, gran_m)]
+///   scale_B_ptr[col / scale_granularity_n + (k / scale_granularity_k) * ceil_div(N, gran_n)]
+///
+/// Preconditions:
+///   - only the x dimension of the launch is used; at least M * N threads are needed to
+///     cover all of C;
+///   - scale_granularity_m / _n / _k are used as divisors and must be non-zero.
+///
+/// All element offsets are computed in 64-bit so that matrices with more than 2^31 elements
+/// do not overflow the index arithmetic.
+template <typename ADataType,
+          typename BDataType,
+          typename AccDataType,
+          typename CDataType,
+          typename LayoutA,
+          typename LayoutB,
+          typename LayoutC>
+__global__ void blockwise_gemm_kernel(ADataType* A,
+                                      BDataType* B,
+                                      CDataType* C,
+                                      ck_tile::index_t M,
+                                      ck_tile::index_t N,
+                                      ck_tile::index_t K,
+                                      ck_tile::index_t strideA,
+                                      ck_tile::index_t strideB,
+                                      ck_tile::index_t strideC,
+                                      ck_tile::index_t scale_granularity_m,
+                                      ck_tile::index_t scale_granularity_n,
+                                      ck_tile::index_t scale_granularity_k,
+                                      float* scale_A_ptr,
+                                      float* scale_B_ptr)
+{
+    // Widen every factor before multiplying: the 32-bit product blockIdx.x * blockDim.x
+    // wraps for large grids.
+    const long long idx = static_cast<long long>(blockIdx.x) * static_cast<long long>(blockDim.x) +
+                          static_cast<long long>(threadIdx.x);
+    const long long row = idx / N; // Compute row index
+    const long long col = idx % N; // Compute column index
+
+    // Number of logical elements stored per ADataType / BDataType object.
+    constexpr index_t packed_size_a = ck_tile::numeric_traits<ADataType>::PackedSize;
+    constexpr index_t packed_size_b = ck_tile::numeric_traits<BDataType>::PackedSize;
+
+    if(row < M && col < N)
+    {
+        AccDataType acc = 0.0, acc_temp = 0.0;
+
+        // Number of scale entries along M (resp. N) for one K block.
+        index_t scale_A_stride = (M + scale_granularity_m - 1) / scale_granularity_m;
+        index_t scale_B_stride = (N + scale_granularity_n - 1) / scale_granularity_n;
+
+        float scale_A = 0;
+        float scale_B = 0;
+
+        for(int k = 0; k < K; ++k)
+        {
+            if(k % scale_granularity_k == 0)
+            {
+                // Fold the finished K block into the accumulator using the scales it was
+                // computed under. At k == 0 this adds 0 because acc_temp is still zero.
+                acc += acc_temp * scale_A * scale_B;
+                acc_temp = 0.0;
+                // Load the scale factors of the K block that starts at this k.
+                const long long k_block = k / scale_granularity_k;
+                scale_A = scale_A_ptr[(row / scale_granularity_m) + k_block * scale_A_stride];
+                scale_B = scale_B_ptr[(col / scale_granularity_n) + k_block * scale_B_stride];
+            }
+
+            // Adjust indexing based on matrix layout
+            const long long a_index = (std::is_same_v<LayoutA, tensor_layout::gemm::RowMajor>)
+                                          ? row * strideA + k
+                                          : static_cast<long long>(k) * strideA + row;
+            const long long b_index = (std::is_same_v<LayoutB, tensor_layout::gemm::ColumnMajor>)
+                                          ? col * strideB + k
+                                          : static_cast<long long>(k) * strideB + col;
+
+            AccDataType v_a;
+            AccDataType v_b;
+            // For the packed types, one stored object is unpacked into two floats and the
+            // parity of k picks the element: even k -> lo, odd k -> hi.
+            if constexpr(std::is_same_v<ADataType, pk_int4_t>)
+            {
+                const fp32x2_t fp32_val = pk_int4_t_to_fp32x2_t(A[a_index / packed_size_a]);
+                if(k % 2 == 1)
+                    v_a = fp32_val.hi;
+                else
+                    v_a = fp32_val.lo;
+            }
+            else if constexpr(std::is_same_v<ADataType, pk_fp4_t>)
+            {
+                // NOTE(review): the B path below passes an explicit 1.0f as second argument
+                // while this call does not - confirm both resolve to the same conversion.
+                const fp32x2_t fp32_val = pk_fp4_to_fp32x2(A[a_index / packed_size_a]);
+                if(k % 2 == 1)
+                    v_a = fp32_val.hi;
+                else
+                    v_a = fp32_val.lo;
+            }
+            else
+            {
+                v_a = ck_tile::type_convert<AccDataType>(A[a_index]);
+            }
+
+            if constexpr(std::is_same_v<BDataType, pk_int4_t>)
+            {
+                const fp32x2_t fp32_val = pk_int4_t_to_fp32x2_t(B[b_index / packed_size_b]);
+                if(k % 2 == 1)
+                    v_b = fp32_val.hi;
+                else
+                    v_b = fp32_val.lo;
+            }
+            else if constexpr(std::is_same_v<BDataType, pk_fp4_t>)
+            {
+                const fp32x2_t fp32_val = pk_fp4_to_fp32x2(B[b_index / packed_size_b], 1.0f);
+                if(k % 2 == 1)
+                    v_b = fp32_val.hi;
+                else
+                    v_b = fp32_val.lo;
+            }
+            else
+            {
+                v_b = ck_tile::type_convert<AccDataType>(B[b_index]);
+            }
+            acc_temp += v_a * v_b;
+        }
+        // final accumulation
+        acc += acc_temp * scale_A * scale_B;
+
+        const long long c_index = (std::is_same_v<LayoutC, tensor_layout::gemm::RowMajor>)
+                                      ? row * strideC + col
+                                      : col * strideC + row;
+        C[c_index] = ck_tile::type_convert<CDataType>(acc);
+    }
+}
+
 template <typename ADataType,
          typename BDataType,
          typename AccDataType,
@@ -534,6 +665,51 @@ void reference_gemm_gpu(ADataType* a_ptr,
    return;
 }

+/// Host-side launcher for blockwise_gemm_kernel.
+///
+/// Launches one thread per element of C (M * N threads in total) in blocks of 256 threads,
+/// with no explicit stream argument, and forwards all arguments unchanged to the kernel.
+/// The function neither synchronizes nor checks the launch result; callers that need the
+/// output must synchronize themselves.
+///
+/// The pointers are handed straight to the kernel - presumably they must be device-accessible
+/// (confirm against callers). The scale granularities are used as divisors inside the kernel
+/// and must be non-zero.
+///
+/// If M * N is not positive there is nothing to compute and no kernel is launched.
+template <typename ADataType,
+          typename BDataType,
+          typename AccDataType,
+          typename CDataType,
+          typename LayoutA,
+          typename LayoutB,
+          typename LayoutC>
+void reference_blockwise_gemm_gpu(ADataType* a_ptr,
+                                  BDataType* b_ptr,
+                                  CDataType* c_ptr,
+                                  index_t M,
+                                  index_t N,
+                                  index_t K,
+                                  index_t stride_a,
+                                  index_t stride_b,
+                                  index_t stride_c,
+                                  index_t scale_granularity_m,
+                                  index_t scale_granularity_n,
+                                  index_t scale_granularity_k,
+                                  float* scale_A_ptr,
+                                  float* scale_B_ptr)
+{
+    // Widen before multiplying so that M * N cannot overflow 32-bit arithmetic.
+    const long long totalElements = static_cast<long long>(M) * static_cast<long long>(N);
+    if(totalElements <= 0)
+    {
+        // Empty output: a grid with zero blocks is not a valid launch configuration.
+        return;
+    }
+
+    const int numThreadsPerBlock = 256; // Common choice for threads per block
+    // Ceil-divide so that a partially filled last block still covers the tail of C.
+    const int numBlocks =
+        static_cast<int>((totalElements + numThreadsPerBlock - 1) / numThreadsPerBlock);
+
+    blockwise_gemm_kernel<ADataType, BDataType, AccDataType, CDataType, LayoutA, LayoutB, LayoutC>
+        <<<numBlocks, numThreadsPerBlock>>>(a_ptr,
+                                            b_ptr,
+                                            c_ptr,
+                                            M,
+                                            N,
+                                            K,
+                                            stride_a,
+                                            stride_b,
+                                            stride_c,
+                                            scale_granularity_m,
+                                            scale_granularity_n,
+                                            scale_granularity_k,
+                                            scale_A_ptr,
+                                            scale_B_ptr);
+
+    return;
+}
+
 template <typename ADataType,
          typename BDataType,
          typename AccDataType,
@@ -571,4 +747,5 @@ void reference_batched_gemm_gpu(ADataType* a_ptr,

    return;
 }
+
 } // namespace ck_tile