From bc72ec4cfbe10f28f6a6447dd73736268f98dedb Mon Sep 17 00:00:00 2001 From: Yanxing-Shi Date: Tue, 6 May 2025 08:29:11 +0000 Subject: [PATCH] fix doc --- tile_engine/ops/gemm/README.md | 37 +++++++------- tile_engine/ops/gemm/gemm_host_api.hpp | 69 +++++++++++++++++++------- 2 files changed, 68 insertions(+), 38 deletions(-) diff --git a/tile_engine/ops/gemm/README.md b/tile_engine/ops/gemm/README.md index 4addfa0d5b..0ac3629ba6 100644 --- a/tile_engine/ops/gemm/README.md +++ b/tile_engine/ops/gemm/README.md @@ -20,25 +20,24 @@ make tile_engine_gemm -j ## tile_engine_gemm inputs ``` - -m m dimension (default:3840) - -n n dimension (default:4096) - -k k dimension (default:2048) - -stride_a Tensor A stride (default:0) - -stride_b Tensor B stride (default:0) - -stride_c Tensor C stride (default:0) - -split_k SplitK value (default:1) - -v No validation: 0, Validation on CPU: 1, Validation on GPU: 2 (default:2) - -metric The metric value of kernel performance - latency: 0, tflops: 1, bandwidth: 2 (default:0) - -warmup Number of iterations before benchmark the kernel (default:50) - -repeat Number of iterations to benchmark the kernel (default:100) - -timer gpu:gpu timer, cpu:cpu timer (default:gpu) - -init Value for initializing tensor - random: 0, linear: 1, constant(1): 2 (default:0) - -pipeline possible values are: compv3, compv4, mem (default:compv3) - -scheduler possible values are: intrawave, interwave (default:intrawave) - -epilogue possible values are: cshuffle, default (default:cshuffle) - -pad_m Pad in m direction - true/false (default:false) - -pad_n Pad in n direction - true/false (default:false) - -pad_k Pad in k direction - true/false (default:false) + -m The value for m dimension. Default is 3840. + -n The value for n dimension. Default is 4096. + -k The value for k dimension. Default is 2048. + -stride_a The stride value for tensor A. Default is 0. + -stride_b The stride value for tensor B. Default is 0. + -stride_c The stride value for tensor C Default is 0. + -split_k The split value for k dimension. Default is 1. + -v The type of validation. Set to 0 for no validation, 1 for validation on CPU, or 2 for validation on GPU. Default is 2, validation on GPU. + -warmup The number of iterations before benchmark the kernel. Default is 50. + -repeat The number of iterations to benchmark the kernel. Default is 100. + -timer The type of timer. Possible values are gpu timer or cpu timer. Default is gpu timer. + -init The method of tensor initialization. Set to 0 for random, to 1 for linear, or 2 for constant(1). Default is 0, random. + -metric Metric with which to measure kernel performance. Set to 0 for latency, 1 for tflops, or 2 for bandwidth. Default is 0, latency. + -pipeline The type of pipeline. Possible values are compv3, compv4 or mem. Default is compv3. + -epilogue The type of epilogue. Possible values are cshuffle or default. Default is csshuffle. + -pad_m Whether pad or not in m direction. Possible values are true or false. Default is false. + -pad_n Whether pad or not in n direction. Possible values are true or false. Default is false. + -pad_k Whether pad or not in k direction. Possible values are true or false. Default is false. Note: pipeline, scheduler, epilogue, pad_m, pad_n, pad_k should be one of the options specified in instance_combination.json ``` diff --git a/tile_engine/ops/gemm/gemm_host_api.hpp b/tile_engine/ops/gemm/gemm_host_api.hpp index 6574a12dbb..356a42bab7 100644 --- a/tile_engine/ops/gemm/gemm_host_api.hpp +++ b/tile_engine/ops/gemm/gemm_host_api.hpp @@ -225,25 +225,56 @@ auto calculate_rtol_atol(const ck_tile::index_t K, inline auto create_args(int argc, char* argv[]) { ck_tile::ArgParser arg_parser; - arg_parser.insert("m", "3840", "m dimension") - .insert("n", "4096", "n dimension") - .insert("k", "2048", "k dimension") - .insert("stride_a", "0", "Tensor A stride") - .insert("stride_b", "0", "Tensor B stride") - .insert("stride_c", "0", "Tensor C stride") - .insert("split_k", "1", "splitK value") - .insert("v", "2", "0. No validation, 1. Validation on CPU, 2. Validation on GPU") - .insert("warmup", "50", "number of iterations before benchmark the kernel") - .insert("repeat", "100", "number of iterations to benchmark the kernel") - .insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer") - .insert("init", "0", "0:random, 1:linear, 2:constant(1)") - .insert("metric", "0", "0:latency, 1:tflops, 2:bandwidth") - .insert("pipeline", "compv3", "compv3, compv4, mem") - .insert("scheduler", "intrawave", "intrawave, interwave") - .insert("epilogue", "cshuffle", "cshuffle, default") - .insert("pad_m", "false", "true, false") - .insert("pad_n", "false", "true, false") - .insert("pad_k", "false", "true, false"); + arg_parser.insert("m", "3840", "The value for m dimension. Default is 3840.") + .insert("n", "4096", "The value for n dimension. Default is 4096.") + .insert("k", "2048", "The value for k dimension. Default is 2048.") + .insert("stride_a", "0", "The stride value for tensor A. Default is 0.") + .insert("stride_b", "0", "The stride value for tensor B. Default is 0.") + .insert("stride_c", "0", "The stride value for tensor C Default is 0.") + .insert("split_k", "1", "The split value for k dimension. Default is 1.") + .insert("v", + "2", + "The type of validation. Set to 0 for no validation, 1 for validation on CPU, or 2 " + "for validation on GPU. Default is 2, validation on GPU.") + .insert( + "warmup", "50", "The number of iterations before benchmark the kernel. Default is 50.") + .insert( + "repeat", "100", "The number of iterations to benchmark the kernel. Default is 100.") + .insert( + "timer", + "gpu", + "The type of timer. Possible values are gpu timer or cpu timer. Default is gpu timer.") + .insert("init", + "0", + "The method of tensor initialization. Set to 0 for random, to 1 for linear, or 2 " + "for constant(1). Default is 0, random.") + .insert("metric", + "0", + "Metric with which to measure kernel performance. Set to 0 for latency, 1 for " + "tflops, or 2 for bandwidth. Default is 0, latency.") + .insert( + "pipeline", + "compv3", + "The type of pipeline. Possible values are compv3, compv4 or mem. Default is compv3.") + .insert("scheduler", + "The type of pipeline. Possible values are compv3, compv4 or mem. Default is " + "compv3.") + .insert( + "epilogue", + "cshuffle", + "The type of epilogue. Possible values are cshuffle or default. Default is csshuffle.") + .insert("pad_m", + "false", + "Whether pad or not in m direction. Possible values are true or false. Default is " + "false.") + .insert("pad_n", + "false", + "Whether pad or not in n direction. Possible values are true or false. Default is " + "false.") + .insert("pad_k", + "false", + "Whether pad or not in k direction. Possible values are true or false. Default is " + "false."); bool result = arg_parser.parse(argc, argv); return std::make_tuple(result, arg_parser);