Add json dump support to output details from CK/CKTile Examples. (#2551)

* Adding RapidJson Library * Adding Json Dumps in all CK_Tile Examples Not verified yet * Adding json to cktile Batched Transpose * adding json dumps to layernorm2d_fwd * Adding json dump to flatmm_basic * Adding RapidJson Library * Adding Json Dumps in all CK_Tile Examples Not verified yet * Adding json to cktile Batched Transpose * adding json dumps to layernorm2d_fwd * Adding json dump to flatmm_basic * Adding json in 03_gemm * Add json dump to 16_batched_gemm * Add json dump to gemm_multi_d_fp16 * Add json dump to grouped_gemm * fix fmha_bwd/fwd * Fix clang-format errors exclude include/rapidjson in jenkins as its a third-party library * Saparating function and defination. * Update Documentation of 03_gemm * Refactoring as per code review * Disable fp8 instances on unsupported targets (#2592) * Restrict building of gemm_universal_preshuffle_f8 instances to specific targets in CMakeLists.txt * Add condition to skip gemm_xdl_universal_preshuffle_f8 instances for unsupported targets in CMakeLists.txt * Add conditions to skip unsupported targets for gemm_universal_preshuffle_f8 and gemm_xdl_universal_preshuffle_f8 instances in CMakeLists.txt * Refine conditions to exclude gemm_universal_preshuffle_f8 instances for unsupported targets in CMakeLists.txt --------- Co-authored-by: AviralGoelAMD <aviralgoel@amd.com> * fix clang format * remove duplicate lines of code from library/src/tensor_operation_instance/gpu/CMakeLists.txt * Fixing Readme and unifying jsondumps * adding moe_smoothquant * adding fused_moe * Fixing Readme for batched_gemm * Fixing Readme for grouped_gemm * adding flatmm * adding gemm_multi_d_fp16 * adding elementwise * adding File name when json is dumped * Fixing Reduce after merge * adding batched_transpose * Adding Warptile in Gemm * Fixing Clang Format --------- Co-authored-by: Aviral Goel <aviral.goel@amd.com> Co-authored-by: AviralGoelAMD <aviralgoel@amd.com> Co-authored-by: illsilin_amdeng <Illia.Silin@amd.com>
2026-04-20 14:59:17 +00:00 · 2025-09-03 12:01:29 +05:30
parent e1ab460d2d
commit 4d041837ad
88 changed files with 21219 additions and 856 deletions
--- a/example/ck_tile/10_rmsnorm2d/README.md
+++ b/example/ck_tile/10_rmsnorm2d/README.md
@@ -6,17 +6,34 @@ This folder contains example for Rmsnorm2D forward using ck_tile tile-programmin
 ```
 # in the root of ck_tile
 mkdir build && cd build
-../script/cmake-ck-dev.sh  ../ <arch>  # you can replace this <arch> to gfx90a, gfx942...
-make tile_rmsnorm2d_fwd -j
+sh ../script/cmake-ck-dev.sh  ../ <arch>  # you can replace this <arch> to gfx90a, gfx942...
+make tile_rmsnorm2d_fwd -j`nproc`
 ```
 This will result in an executable `build/bin/tile_rmsnorm2d_fwd`

 ## cmdline
 ```
 args:
-          -m    m dimension (default:3328)
-          -n    m dimension (default:4096)
-          -e    epsilon (default:1e-5)
-          -v    cpu validation or not (default:1)
-       -prec    precision (default:fp16)
+           -m    m dimension (default:3328)
+           -n    n dimension (default:4096)
+    -x_stride    x row_stride, if -1 then equal to n (default:-1)
+   -xr_stride    x residual row_stride, if -1 then equal to n (default:-1)
+    -y_stride    y row_stride, if -1 then equal to n (default:-1)
+   -yr_stride    y residual row_stride, if -1 then equal to n (default:-1)
+           -e    epsilon (default:1e-5)
+    -save_rms    save rms(invrms) or not. set to 1 in training case (default:0)
+-save_unquant    save result before quant (default:0)
+           -v    cpu validation or not (default:1)
+       -kname    print kernel name or not (default:1)
+      -prec_i    input precision (default:fp16)
+      -prec_o    output precision, set auto will be the same as input (default:auto)
+     -prec_sm    output quant scale type, set auto will use fp32. used when fquant=1 (default:auto)
+     -prec_sy    output quant scale type, set auto will use fp32. used when fquant=1 or 2 (default:auto)
+        -fadd    fused-add, 0:no fused add, 1:preadd+store, 2:preadd only (default:0)
+      -fquant    fused-quant, 0:no, 1:smooth-dynamic-quant, 2:dynamic-quant (default:0)
+      -warmup    cold iter (default:5)
+      -repeat    hot iter (default:20)
+           -s    sensitive model mode, 0: for no specific model, 1: for T5-like model (default:0)
+        -json    0: No Json, 1: Dump Results in Json format (default:0)
+    -jsonfile    json file name to dump results (default:rmsnorm2d_fwd.json)
 ```
--- a/example/ck_tile/10_rmsnorm2d/rmsnorm2d_fwd.cpp
+++ b/example/ck_tile/10_rmsnorm2d/rmsnorm2d_fwd.cpp
@@ -1,6 +1,7 @@
 #include "ck_tile/host.hpp"
 #include "rmsnorm2d_fwd.hpp"
 #include <cstring>
+#include "json_dump.hpp"

 // different threshold for different dtype
 template <typename DataType>
@@ -53,7 +54,9 @@ auto create_args(int argc, char* argv[])
        .insert("fquant", "0", "fused-quant, 0:no, 1:smooth-dynamic-quant, 2:dynamic-quant")
        .insert("warmup", "5", "cold iter")
        .insert("repeat", "20", "hot iter")
-        .insert("s", "0", "sensitive model mode, 0: for no specific model, 1: for T5-like model");
+        .insert("s", "0", "sensitive model mode, 0: for no specific model, 1: for T5-like model")
+        .insert("json", "0", "0: No Json, 1: Dump Results in Json format")
+        .insert("jsonfile", "rmsnorm2d_fwd.json", "json file name to dump results");

    bool result = arg_parser.parse(argc, argv);
    return std::make_tuple(result, arg_parser);
@@ -437,6 +440,23 @@ bool run(const ck_tile::ArgParser& arg_parser)
        std::cout << ", valid:" << (pass ? "y" : "n") << std::flush << std::endl;
    }

+    if(arg_parser.get_int("json") == 1)
+    {
+        dump_rmsnorm2d_fwd_json(arg_parser.get_str("jsonfile"),
+                                prec_str,
+                                m,
+                                n,
+                                x_stride,
+                                xr_stride,
+                                y_stride,
+                                yr_stride,
+                                use_model_sensitive_rmsnorm,
+                                ave_time,
+                                0,
+                                gb_per_sec,
+                                pass);
+    }
+
    return pass;
 }