This commit is contained in:
Yanxing-Shi
2025-05-06 08:29:11 +00:00
parent 45a74f2b24
commit bc72ec4cfb
2 changed files with 68 additions and 38 deletions

View File

@@ -20,25 +20,24 @@ make tile_engine_gemm -j
## tile_engine_gemm inputs
```
-m m dimension (default:3840)
-n n dimension (default:4096)
-k k dimension (default:2048)
-stride_a Tensor A stride (default:0)
-stride_b Tensor B stride (default:0)
-stride_c Tensor C stride (default:0)
-split_k SplitK value (default:1)
-v No validation: 0, Validation on CPU: 1, Validation on GPU: 2 (default:2)
-metric The metric value of kernel performance - latency: 0, tflops: 1, bandwidth: 2 (default:0)
-warmup Number of iterations before benchmark the kernel (default:50)
-repeat Number of iterations to benchmark the kernel (default:100)
-timer gpu:gpu timer, cpu:cpu timer (default:gpu)
-init Value for initializing tensor - random: 0, linear: 1, constant(1): 2 (default:0)
-pipeline possible values are: compv3, compv4, mem (default:compv3)
-scheduler possible values are: intrawave, interwave (default:intrawave)
-epilogue possible values are: cshuffle, default (default:cshuffle)
-pad_m Pad in m direction - true/false (default:false)
-pad_n Pad in n direction - true/false (default:false)
-pad_k Pad in k direction - true/false (default:false)
-m The value for m dimension. Default is 3840.
-n The value for n dimension. Default is 4096.
-k The value for k dimension. Default is 2048.
-stride_a The stride value for tensor A. Default is 0.
-stride_b The stride value for tensor B. Default is 0.
-stride_c The stride value for tensor C Default is 0.
-split_k The split value for k dimension. Default is 1.
-v The type of validation. Set to 0 for no validation, 1 for validation on CPU, or 2 for validation on GPU. Default is 2, validation on GPU.
-warmup The number of iterations before benchmark the kernel. Default is 50.
-repeat The number of iterations to benchmark the kernel. Default is 100.
-timer The type of timer. Possible values are gpu timer or cpu timer. Default is gpu timer.
-init The method of tensor initialization. Set to 0 for random, to 1 for linear, or 2 for constant(1). Default is 0, random.
-metric Metric with which to measure kernel performance. Set to 0 for latency, 1 for tflops, or 2 for bandwidth. Default is 0, latency.
-pipeline The type of pipeline. Possible values are compv3, compv4 or mem. Default is compv3.
-epilogue The type of epilogue. Possible values are cshuffle or default. Default is csshuffle.
-pad_m Whether pad or not in m direction. Possible values are true or false. Default is false.
-pad_n Whether pad or not in n direction. Possible values are true or false. Default is false.
-pad_k Whether pad or not in k direction. Possible values are true or false. Default is false.
Note: pipeline, scheduler, epilogue, pad_m, pad_n, pad_k should be one of the options specified in instance_combination.json
```

View File

@@ -225,25 +225,56 @@ auto calculate_rtol_atol(const ck_tile::index_t K,
inline auto create_args(int argc, char* argv[])
{
ck_tile::ArgParser arg_parser;
arg_parser.insert("m", "3840", "m dimension")
.insert("n", "4096", "n dimension")
.insert("k", "2048", "k dimension")
.insert("stride_a", "0", "Tensor A stride")
.insert("stride_b", "0", "Tensor B stride")
.insert("stride_c", "0", "Tensor C stride")
.insert("split_k", "1", "splitK value")
.insert("v", "2", "0. No validation, 1. Validation on CPU, 2. Validation on GPU")
.insert("warmup", "50", "number of iterations before benchmark the kernel")
.insert("repeat", "100", "number of iterations to benchmark the kernel")
.insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer")
.insert("init", "0", "0:random, 1:linear, 2:constant(1)")
.insert("metric", "0", "0:latency, 1:tflops, 2:bandwidth")
.insert("pipeline", "compv3", "compv3, compv4, mem")
.insert("scheduler", "intrawave", "intrawave, interwave")
.insert("epilogue", "cshuffle", "cshuffle, default")
.insert("pad_m", "false", "true, false")
.insert("pad_n", "false", "true, false")
.insert("pad_k", "false", "true, false");
arg_parser.insert("m", "3840", "The value for m dimension. Default is 3840.")
.insert("n", "4096", "The value for n dimension. Default is 4096.")
.insert("k", "2048", "The value for k dimension. Default is 2048.")
.insert("stride_a", "0", "The stride value for tensor A. Default is 0.")
.insert("stride_b", "0", "The stride value for tensor B. Default is 0.")
.insert("stride_c", "0", "The stride value for tensor C Default is 0.")
.insert("split_k", "1", "The split value for k dimension. Default is 1.")
.insert("v",
"2",
"The type of validation. Set to 0 for no validation, 1 for validation on CPU, or 2 "
"for validation on GPU. Default is 2, validation on GPU.")
.insert(
"warmup", "50", "The number of iterations before benchmark the kernel. Default is 50.")
.insert(
"repeat", "100", "The number of iterations to benchmark the kernel. Default is 100.")
.insert(
"timer",
"gpu",
"The type of timer. Possible values are gpu timer or cpu timer. Default is gpu timer.")
.insert("init",
"0",
"The method of tensor initialization. Set to 0 for random, to 1 for linear, or 2 "
"for constant(1). Default is 0, random.")
.insert("metric",
"0",
"Metric with which to measure kernel performance. Set to 0 for latency, 1 for "
"tflops, or 2 for bandwidth. Default is 0, latency.")
.insert(
"pipeline",
"compv3",
"The type of pipeline. Possible values are compv3, compv4 or mem. Default is compv3.")
.insert("scheduler",
"The type of pipeline. Possible values are compv3, compv4 or mem. Default is "
"compv3.")
.insert(
"epilogue",
"cshuffle",
"The type of epilogue. Possible values are cshuffle or default. Default is csshuffle.")
.insert("pad_m",
"false",
"Whether pad or not in m direction. Possible values are true or false. Default is "
"false.")
.insert("pad_n",
"false",
"Whether pad or not in n direction. Possible values are true or false. Default is "
"false.")
.insert("pad_k",
"false",
"Whether pad or not in k direction. Possible values are true or false. Default is "
"false.");
bool result = arg_parser.parse(argc, argv);
return std::make_tuple(result, arg_parser);