fix doc

2026-06-29 19:28:33 +00:00 · 2025-05-06 08:29:11 +00:00
parent 45a74f2b24
commit bc72ec4cfb
2 changed files with 68 additions and 38 deletions
--- a/tile_engine/ops/gemm/README.md
+++ b/tile_engine/ops/gemm/README.md
@@ -20,25 +20,24 @@ make tile_engine_gemm -j
 ## tile_engine_gemm inputs
 ```

-          -m    m dimension (default:3840)
-          -n    n dimension (default:4096)
-          -k    k dimension (default:2048)
-   -stride_a    Tensor A stride (default:0)
-   -stride_b    Tensor B stride (default:0)
-   -stride_c    Tensor C stride (default:0)
-    -split_k    SplitK value (default:1)
-          -v    No validation: 0, Validation on CPU: 1, Validation on GPU: 2 (default:2)
-     -metric    The metric value of kernel performance - latency: 0, tflops: 1, bandwidth: 2 (default:0)
-     -warmup    Number of iterations before benchmark the kernel (default:50)
-     -repeat    Number of iterations to benchmark the kernel (default:100)
-      -timer    gpu:gpu timer, cpu:cpu timer (default:gpu)
-       -init    Value for initializing tensor - random: 0, linear: 1, constant(1): 2 (default:0)
-   -pipeline    possible values are: compv3, compv4, mem (default:compv3)
-  -scheduler    possible values are: intrawave, interwave (default:intrawave)
-   -epilogue    possible values are: cshuffle, default (default:cshuffle)
-      -pad_m    Pad in m direction - true/false (default:false)
-      -pad_n    Pad in n direction - true/false (default:false)
-      -pad_k    Pad in k direction - true/false (default:false)
+          -m    The value for m dimension. Default is 3840.
+          -n    The value for n dimension. Default is 4096.
+          -k    The value for k dimension. Default is 2048.
+   -stride_a    The stride value for tensor A. Default is 0.
+   -stride_b    The stride value for tensor B. Default is 0.
+   -stride_c    The stride value for tensor C. Default is 0.
+    -split_k    The split value for k dimension. Default is 1.
+          -v    The type of validation. Set to 0 for no validation, 1 for validation on CPU, or 2 for validation on GPU. Default is 2, validation on GPU.
+     -warmup    The number of iterations before benchmarking the kernel. Default is 50.
+     -repeat    The number of iterations to benchmark the kernel. Default is 100.
+      -timer    The type of timer. Set to gpu for the GPU timer or cpu for the CPU timer. Default is gpu.
+       -init    The method of tensor initialization. Set to 0 for random, to 1 for linear, or 2 for constant(1). Default is 0, random.
+     -metric    Metric with which to measure kernel performance. Set to 0 for latency, 1 for tflops, or 2 for bandwidth. Default is 0, latency.
+   -pipeline    The type of pipeline. Possible values are compv3, compv4 or mem. Default is compv3.     
+   -epilogue    The type of epilogue. Possible values are cshuffle or default. Default is cshuffle.
+      -pad_m    Whether pad or not in m direction. Possible values are true or false. Default is false. 
+      -pad_n    Whether pad or not in n direction. Possible values are true or false. Default is false. 
+      -pad_k    Whether pad or not in k direction. Possible values are true or false. Default is false. 

 Note: pipeline, scheduler, epilogue, pad_m, pad_n, pad_k should be one of the options specified in instance_combination.json 
 ```
--- a/tile_engine/ops/gemm/gemm_host_api.hpp
+++ b/tile_engine/ops/gemm/gemm_host_api.hpp
@@ -225,25 +225,56 @@ auto calculate_rtol_atol(const ck_tile::index_t K,
 inline auto create_args(int argc, char* argv[])
 {
    ck_tile::ArgParser arg_parser;
-    arg_parser.insert("m", "3840", "m dimension")
-        .insert("n", "4096", "n dimension")
-        .insert("k", "2048", "k dimension")
-        .insert("stride_a", "0", "Tensor A stride")
-        .insert("stride_b", "0", "Tensor B stride")
-        .insert("stride_c", "0", "Tensor C stride")
-        .insert("split_k", "1", "splitK value")
-        .insert("v", "2", "0. No validation, 1. Validation on CPU, 2. Validation on GPU")
-        .insert("warmup", "50", "number of iterations before benchmark the kernel")
-        .insert("repeat", "100", "number of iterations to benchmark the kernel")
-        .insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer")
-        .insert("init", "0", "0:random, 1:linear, 2:constant(1)")
-        .insert("metric", "0", "0:latency, 1:tflops, 2:bandwidth")
-        .insert("pipeline", "compv3", "compv3, compv4, mem")
-        .insert("scheduler", "intrawave", "intrawave, interwave")
-        .insert("epilogue", "cshuffle", "cshuffle, default")
-        .insert("pad_m", "false", "true, false")
-        .insert("pad_n", "false", "true, false")
-        .insert("pad_k", "false", "true, false");
+    arg_parser.insert("m", "3840", "The value for m dimension. Default is 3840.")
+        .insert("n", "4096", "The value for n dimension. Default is 4096.")
+        .insert("k", "2048", "The value for k dimension. Default is 2048.")
+        .insert("stride_a", "0", "The stride value for tensor A. Default is 0.")
+        .insert("stride_b", "0", "The stride value for tensor B. Default is 0.")
+        .insert("stride_c", "0", "The stride value for tensor C. Default is 0.")
+        .insert("split_k", "1", "The split value for k dimension. Default is 1.")
+        .insert("v",
+                "2",
+                "The type of validation. Set to 0 for no validation, 1 for validation on CPU, or 2 "
+                "for validation on GPU. Default is 2, validation on GPU.")
+        .insert(
+            "warmup", "50", "The number of iterations before benchmarking the kernel. Default is 50.")
+        .insert(
+            "repeat", "100", "The number of iterations to benchmark the kernel. Default is 100.")
+        .insert(
+            "timer",
+            "gpu",
+            "The type of timer. Set to gpu for the GPU timer or cpu for the CPU timer. Default is gpu.")
+        .insert("init",
+                "0",
+                "The method of tensor initialization. Set to 0 for random, to 1 for linear, or 2 "
+                "for constant(1). Default is 0, random.")
+        .insert("metric",
+                "0",
+                "Metric with which to measure kernel performance. Set to 0 for latency, 1 for "
+                "tflops, or 2 for bandwidth. Default is 0, latency.")
+        .insert(
+            "pipeline",
+            "compv3",
+            "The type of pipeline. Possible values are compv3, compv4 or mem. Default is compv3.")
+        .insert("scheduler", "intrawave",
+                "The type of scheduler. Possible values are intrawave or interwave. Default is "
+                "intrawave.")
+        .insert(
+            "epilogue",
+            "cshuffle",
+            "The type of epilogue. Possible values are cshuffle or default. Default is cshuffle.")
+        .insert("pad_m",
+                "false",
+                "Whether pad or not in m direction. Possible values are true or false. Default is "
+                "false.")
+        .insert("pad_n",
+                "false",
+                "Whether pad or not in n direction. Possible values are true or false. Default is "
+                "false.")
+        .insert("pad_k",
+                "false",
+                "Whether pad or not in k direction. Possible values are true or false. Default is "
+                "false.");

    bool result = arg_parser.parse(argc, argv);
    return std::make_tuple(result, arg_parser);