From 271978ec7c839b6828e1ac90dd2f47cb294dc259 Mon Sep 17 00:00:00 2001 From: "assistant-librarian[bot]" Date: Tue, 20 May 2025 23:06:37 +0000 Subject: [PATCH] Merge commit '990d645578b4a195f5c5b8479eeef47d828faa98' into develop --- .../add_rmsnorm2d_rdquant_fwd.cpp | 21 +++++++++------- profiler/README.md | 24 +++++++++++++++++++ 2 files changed, 36 insertions(+), 9 deletions(-) diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/add_rmsnorm2d_rdquant_fwd.cpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/add_rmsnorm2d_rdquant_fwd.cpp index 574edf64d3..06c04b763e 100644 --- a/example/ck_tile/11_add_rmsnorm2d_rdquant/add_rmsnorm2d_rdquant_fwd.cpp +++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/add_rmsnorm2d_rdquant_fwd.cpp @@ -67,13 +67,14 @@ bool run(const ck_tile::ArgParser& arg_parser) using TypeConfig = AddRmsnormRdquantTypeConfig; - using ADataType = typename TypeConfig::ADataType; - using BDataType = typename TypeConfig::BDataType; - using GammaDataType = typename TypeConfig::GammaDataType; - using XDataType = typename TypeConfig::XDataType; - using YScaleDataType = typename TypeConfig::YScaleDataType; - using QYDataType = typename TypeConfig::QYDataType; - using ComputeDataType = float; + using ADataType = typename TypeConfig::ADataType; + using BDataType = typename TypeConfig::BDataType; + using GammaDataType = typename TypeConfig::GammaDataType; + using XDataType = typename TypeConfig::XDataType; + using YScaleDataType = typename TypeConfig::YScaleDataType; + using QYDataType = typename TypeConfig::QYDataType; + using ComputeDataType = float; + using UnquantYDataType = ck_tile::null_type; // host verify ck_tile::HostTensor a_host({m, n}, {stride, 1}); @@ -184,6 +185,7 @@ bool run(const ck_tile::ArgParser& arg_parser) // Rmsnorm2d { ck_tile::HostTensor invRms_host_ref({m}); + ck_tile::HostTensor unquant_y_host_ref({m, n}); // CAUSION: kernel use ComputeDataType version of x, but we use XDataType here for // simplicity @@ -191,8 +193,9 @@ bool run(const 
ck_tile::ArgParser& arg_parser) GammaDataType, ComputeDataType, YDataType, - InvRmsDataType>( - x_host_ref, gamma_host, y_host, invRms_host_ref, epsilon); + InvRmsDataType, + UnquantYDataType>( + x_host_ref, gamma_host, y_host, invRms_host_ref, unquant_y_host_ref, epsilon); } // yscale diff --git a/profiler/README.md b/profiler/README.md index 3f4837aada..4398a878bc 100644 --- a/profiler/README.md +++ b/profiler/README.md @@ -1,5 +1,29 @@ [Back to the main page](../README.md) # Composable Kernel profiler +## Profiler GEMM UNIVERSAL kernels +```bash +# arg1: tensor operation (gemm_universal: Universal GEMM) +# arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8; 4: f8@f16; 5: f16@f8; 6: f16->f8; 7: f8->bf16, comp f8; 8: f16@i4; 9: bf16@i4) +# arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n]; +# 1: A[m, k] * B[n, k] = C[m, n]; +# 2: A[k, m] * B[k, n] = C[m, n]; +# 3: A[k, m] * B[n, k] = C[m, n]) +# arg4: verification (0: no; 1: yes) +# arg5: initialization (0: no init; 1: integer value; 2: decimal value) +# arg6: print tensor value (0: no; 1: yes) +# arg7: time kernel (0=no, 1=yes) +# arg8 to 13: M, N, K, StrideA, StrideB, StrideC +# arg14: split k into multiple batches +# optional: +# arg15: number of warm-up cycles (default 1) +# arg16: number of iterations (default 10) +# arg17: memory for rotating buffer (default 0, size in MB) + + +################ op datatype layout verify init print time M N K StrideA StrideB StrideC SplitK WarmupCycles Iterations MemoryBuffer +./bin/ckProfiler gemm_universal 1 0 1 1 0 1 4096 4096 4096 4096 4096 4096 1 1 10 0 +``` + ## Profile GEMM kernels ```bash #arg1: tensor operation (gemm=GEMM)