From 271978ec7c839b6828e1ac90dd2f47cb294dc259 Mon Sep 17 00:00:00 2001
From: "assistant-librarian[bot]"
 <assistant-librarian[bot]@users.noreply.github.com>
Date: Tue, 20 May 2025 23:06:37 +0000
Subject: [PATCH] Merge commit '990d645578b4a195f5c5b8479eeef47d828faa98' into
 develop

---
 .../add_rmsnorm2d_rdquant_fwd.cpp             | 21 +++++++++-------
 profiler/README.md                            | 24 +++++++++++++++++++
 2 files changed, 36 insertions(+), 9 deletions(-)
diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/add_rmsnorm2d_rdquant_fwd.cpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/add_rmsnorm2d_rdquant_fwd.cpp
index 574edf64d3..06c04b763e 100644
--- a/example/ck_tile/11_add_rmsnorm2d_rdquant/add_rmsnorm2d_rdquant_fwd.cpp
+++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/add_rmsnorm2d_rdquant_fwd.cpp
@@ -67,13 +67,14 @@ bool run(const ck_tile::ArgParser& arg_parser)
 
     using TypeConfig = AddRmsnormRdquantTypeConfig<InputDataType, QuantizedDataType>;
 
-    using ADataType       = typename TypeConfig::ADataType;
-    using BDataType       = typename TypeConfig::BDataType;
-    using GammaDataType   = typename TypeConfig::GammaDataType;
-    using XDataType       = typename TypeConfig::XDataType;
-    using YScaleDataType  = typename TypeConfig::YScaleDataType;
-    using QYDataType      = typename TypeConfig::QYDataType;
-    using ComputeDataType = float;
+    using ADataType        = typename TypeConfig::ADataType;
+    using BDataType        = typename TypeConfig::BDataType;
+    using GammaDataType    = typename TypeConfig::GammaDataType;
+    using XDataType        = typename TypeConfig::XDataType;
+    using YScaleDataType   = typename TypeConfig::YScaleDataType;
+    using QYDataType       = typename TypeConfig::QYDataType;
+    using ComputeDataType  = float;
+    using UnquantYDataType = ck_tile::null_type;
 
     // host verify
     ck_tile::HostTensor<ADataType> a_host({m, n}, {stride, 1});
@@ -184,6 +185,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
         // Rmsnorm2d
         {
             ck_tile::HostTensor<InvRmsDataType> invRms_host_ref({m});
+            ck_tile::HostTensor<UnquantYDataType> unquant_y_host_ref({m, n});
 
             // CAUSION: kernel use ComputeDataType version of x, but we use XDataType here for
             // simplicity
@@ -191,8 +193,9 @@ bool run(const ck_tile::ArgParser& arg_parser)
                                              GammaDataType,
                                              ComputeDataType,
                                              YDataType,
-                                             InvRmsDataType>(
-                x_host_ref, gamma_host, y_host, invRms_host_ref, epsilon);
+                                             InvRmsDataType,
+                                             UnquantYDataType>(
+                x_host_ref, gamma_host, y_host, invRms_host_ref, unquant_y_host_ref, epsilon);
         }
 
         // yscale
diff --git a/profiler/README.md b/profiler/README.md
index 3f4837aada..4398a878bc 100644
--- a/profiler/README.md
+++ b/profiler/README.md
@@ -1,5 +1,29 @@
 [Back to the main page](../README.md)
 # Composable Kernel profiler
+## Profile GEMM UNIVERSAL kernels
+```bash
+# arg1: tensor operation (gemm_universal: Universal GEMM)
+# arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8; 4: f8@f16; 5: f16@f8; 6: f16->f8; 7: f8->bf16, comp f8; 8: f16@i4; 9: bf16@i4)
+# arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];
+#                     1: A[m, k] * B[n, k] = C[m, n];
+#                     2: A[k, m] * B[k, n] = C[m, n];
+#                     3: A[k, m] * B[n, k] = C[m, n])
+# arg4: verification (0: no; 1: yes)
+# arg5: initialization (0: no init; 1: integer value; 2: decimal value)
+# arg6: print tensor value (0: no; 1: yes)
+# arg7: time kernel (0: no; 1: yes)
+# arg8 to 13: M, N, K, StrideA, StrideB, StrideC
+# arg14: split k into multiple batches
+# optional:
+# arg15: number of warm-up cycles (default 1)
+# arg16: number of iterations (default 10)
+# arg17: memory for rotating buffer (default 0, size in MB)
+
+
+################        op  datatype  layout  verify  init  print  time  M N K  StrideA StrideB StrideC  SplitK  WarmupCycles  Iterations  MemoryBuffer
+./bin/ckProfiler gemm_universal 1 0 1 1 0 1 4096 4096 4096 4096 4096 4096 1 1 10 0
+```
+
 ## Profile GEMM kernels
 ```bash
 #arg1: tensor operation (gemm=GEMM)