From 50f0f55fbc364997e0e9cfa3e07bd4b9d590344a Mon Sep 17 00:00:00 2001 From: Po Yen Chen Date: Tue, 8 Oct 2024 10:45:12 +0800 Subject: [PATCH] [CK_TILE] Update example README files & fix script compatibility issue (#1548) * Fix text alignment of ArgParser::print() * Update example README files * Clarify make-ck-dev.sh usage * Only keep some of the argument from '-?' output * Undo command line output changes in README * Only keep existing argument on doc and update description * Fix text alignment * Make cmake-ck-*.sh compatible with 'sh' command [ROCm/composable_kernel commit: 0c094daa7e3fcc3c4b4a6d75c85c31f2925f02a8] --- example/ck_tile/01_fmha/README.md | 45 ++++++++++++------------ example/ck_tile/02_layernorm2d/README.md | 3 +- example/ck_tile/03_gemm/README.md | 20 +++++++---- example/ck_tile/04_img2col/README.md | 3 +- include/ck_tile/host/arg_parser.hpp | 20 ++++++++--- script/cmake-ck-dev.sh | 3 +- script/cmake-ck-release.sh | 3 +- 7 files changed, 60 insertions(+), 37 deletions(-) diff --git a/example/ck_tile/01_fmha/README.md b/example/ck_tile/01_fmha/README.md index 0803d54d66..c7ab296c3b 100644 --- a/example/ck_tile/01_fmha/README.md +++ b/example/ck_tile/01_fmha/README.md @@ -6,7 +6,8 @@ This folder contains example for fmha(fused multi-head attention) using ck_tile ``` # in the root of ck_tile mkdir build && cd build -sh ../script/cmake-ck-dev.sh ../ <arch> # you can replace this <arch> to gfx90a, gfx942... +# you can replace <arch> with the appropriate architecture (for example gfx90a or gfx942) or leave it blank +sh ../script/cmake-ck-dev.sh ../ <arch> make tile_example_fmha_fwd -j ``` This will result in an executable `build/bin/tile_example_fmha_fwd` @@ -23,7 +24,7 @@ There are 3 template parameters for this kernel template. To speed up compile time, we instantiate the kernels into separate file. In this way we can benefit from parallel building from CMake/Make system. This is achieved by `generate.py` script. 
Besides, you can look into this script to learn how to instantiate a kernel instance step by step, which is described in `FMHA_FWD_KERNEL_BODY` variable. ## executable -`tile_example_fmha_fwd` is the example executable, implemented in `fmha_fwd.cpp`. You can type `./bin/tile_example_fmha_fwd -?` to list all supported args. Below is an example of the output (may subject to change) +`tile_example_fmha_fwd` is the example executable, implemented in `fmha_fwd.cpp`. You can type `./bin/tile_example_fmha_fwd -?` to list all the arguments. Below is an example of the output (may subject to change) ``` args: -v weather do CPU validation or not (default:1) @@ -31,48 +32,48 @@ args: -b batch size (default:2) -h num of head, for q (default:8) -h_k num of head, for k/v, -1 means equal to h (default:-1) - if not equal to h, then this is GQA/MQA case + if not equal to h, then this is GQA/MQA case -s seqlen_q. if group-mode, means the average value of seqlen_q (default:3328) - total_seqlen_q = seqlen_q * batch, and seqlen_q per batch may vary - also with "-s=s0,s1,s2..." comma seperated int to set per batch seqlen(group-mode) - -s_k seqlen_k, -1 means equal to s (default:-1) + total_seqlen_q = seqlen_q * batch, and seqlen_q per batch may vary + also with "-s=s0,s1,s2..." comma seperated int to set per batch seqlen(group-mode) + -s_k seqlen_k (including new key/value), -1 means equal to s (default:-1) -d head dim for q, k (default:128) -d_v head dim for v, -1 means equal to d (default:-1) -scale_s scale factor of S. 0 means equal to 1/sqrt(hdim). (default:0) - note when squant=1, this value will be modified by range_q/k + note when squant=1, this value will be modified by range_q/k -range_q per-tensor quantization range of q. used if squant=1. (default:16) -range_k per-tensor quantization range of k. used if squant=1. (default:16) -range_v per-tensor quantization range of v. used if squant=1. (default:16) -range_p per-tensor quantization range of p [e^(s-m)]. used if squant=1. 
(default:1) -range_o per-tensor quantization range of o (p*v). used if squant=1. (default:16) -squant if using static quantization fusion or not. auto: fp8 will default use squant, other will not (default:auto) - 0: no static quant(not implemented) 1: apply scale_p and scale_o with respect to P and O. - calculate scale_s, scale_p, scale_o according to range_q, range_k, range_v, range_p, range_o + 0: no static quant(not implemented) 1: apply scale_p and scale_o with respect to P and O. + calculate scale_s, scale_p, scale_o according to range_q, range_k, range_v, range_p, range_o -iperm permute input (default:1) - if true, will be b*h*s*d, else b*s*h*d + if true, will be b*h*s*d, else b*s*h*d -operm permute output (default:1) -bias n or 0, no bias (default:n) - e(lementwise) or 1, elementwise bias with 1*1*s*s. e:1, 1*h*s*s. e:2, b*h*s*s - a(libi) or 2, alibi with 1*h. a:1, b*h + e(lementwise) or 1, elementwise bias with 1*1*s*s. e:1, 1*h*s*s. e:2, b*h*s*s + a(libi) or 2, alibi with 1*h. a:1, b*h -prec data type. 
fp16/bf16/fp8/bf8 (default:fp16) -mask 0: no mask, 1: top-left(same as 't'), 2:bottom-right(same as 'b') (default:0) - 't', top-left causal mask, 'b', bottom-r causal mask - 't:l,r', top-left sliding window attn(swa) with FA style left right size - 'b:l,r', bottom-r sliding window attn(swa) with FA style left right size - 'xt:window_size', xformer style masking from top-left, window_size negative is causal, positive is swa - 'xb:window_size', xformer style masking from bottom-r, window_size negative is causal, positive is swa - 'g:y,x', generic attention mask coordinate with y/x size (only debug purpose for now) + 't', top-left causal mask, 'b', bottom-r causal mask + 't:l,r', top-left sliding window attn(swa) with FA style left right size + 'b:l,r', bottom-r sliding window attn(swa) with FA style left right size + 'xt:window_size', xformer style masking from top-left, window_size negative is causal, positive is swa + 'xb:window_size', xformer style masking from bottom-r, window_size negative is causal, positive is swa + 'g:y,x', generic attention mask coordinate with y/x size (only debug purpose for now) -vlayout r for row-major(seqlen*hdim), c for col-major(hdim*seqlen) (default:r) -lse 0 not store lse, 1 store lse (default:0) -kname if set to 1 will print kernel name (default:0) -init init method. ui, uniform random int, ni, normalized random int (default:uf) - uf, uniform random float, nf, normalized random float, tf, trig float, uf:q, quantization + uf, uniform random float, nf, normalized random float, tf, trig float, uf:q, quantization -seed random seed used for initializing input tensors. 
0 for non-deterministic seed (default:11939) + -drop_seed seed for random number generator (default:1) +-drop_offset offset for random number generator (default:0) + -drop_prefs seed and offset values are present on GPU; 0 - host, 1 - device/GPU (default:0) -warmup number of iterations before benchmark the kernel (default:5) -repeat number of iterations to benchmark the kernel (default:20) - -drop_seed seed for the random number generator for the dropout layer, default is 1 --drop_offset offset for the dropout layer which is used during random number generation, default is 0 - -drop_prefs flag to indicate `drop_seed` and `drop_offset` values if present on the GPU, default is 0, 0 - host, 1 - GPU ``` Example 1: `./bin/tile_example_fmha_fwd -b=1 -h=16 -s=16384 -d=128` will run a fmha case with batch=1, nhead=16, sequence length=16384, hdim=128, fp16 case. Example 2: `./bin/tile_example_fmha_fwd -b=1 -h=8 -s=16384 -d=64 -drop_prefs=1 -drop_seed=10 -drop_offset=1234` will run a fmha case with diff --git a/example/ck_tile/02_layernorm2d/README.md b/example/ck_tile/02_layernorm2d/README.md index 433dad04e6..66b16c1b7f 100644 --- a/example/ck_tile/02_layernorm2d/README.md +++ b/example/ck_tile/02_layernorm2d/README.md @@ -6,7 +6,8 @@ This folder contains example for Layernorm2D forward using ck_tile tile-programm ``` # in the root of ck_tile mkdir build && cd build -sh ../script/cmake-ck-dev.sh ../ <arch> # you can replace this <arch> to gfx90a, gfx942... 
+# you can replace <arch> with the appropriate architecture (for example gfx90a or gfx942) or leave it blank +sh ../script/cmake-ck-dev.sh ../ <arch> make tile_example_layernorm2d_fwd -j ``` This will result in an executable `build/bin/tile_example_layernorm2d_fwd` diff --git a/example/ck_tile/03_gemm/README.md b/example/ck_tile/03_gemm/README.md index 00303bf62c..aacbdf6863 100644 --- a/example/ck_tile/03_gemm/README.md +++ b/example/ck_tile/03_gemm/README.md @@ -6,7 +6,8 @@ This folder contains example for GEMM using ck_tile tile-programming implementat ``` # in the root of ck_tile mkdir build && cd build -sh ../script/cmake-ck-dev.sh ../ <arch> # you can replace this <arch> to gfx90a, gfx942... +# you can replace <arch> with the appropriate architecture (for example gfx90a or gfx942) or leave it blank +sh ../script/cmake-ck-dev.sh ../ <arch> make tile_example_gemm_basic -j ``` This will result in an executable `build/bin/tile_example_gemm_basic` @@ -14,10 +15,17 @@ This will result in an executable `build/bin/tile_example_gemm_basic` ## example ``` args: - -m m dimension (default:3328) - -n m dimension (default:4096) + -b batch size (default:1) + -m m dimension (default:1024) + -n n dimension (default:2048) -k k dimension (default:64) - -e epsilon (default:1e-5) - -v cpu validation or not (default:1) - -prec precision (default:fp16) + -stride_a Tensor A stride (default:0) + -stride_b Tensor B stride (default:0) + -stride_c Tensor C stride (default:0) + -v 0. No validation, 1. Validation on CPU, 2. Validation on GPU (default:2) + -e Absolute error tolerance (default:1e-5) + -prec data type. 
fp16/bf16/fp8/bf8 (default:fp16) + -warmup number of iterations before benchmark the kernel (default:10) + -repeat number of iterations to benchmark the kernel (default:100) + -timer gpu:gpu timer, cpu:cpu timer (default:gpu) ``` diff --git a/example/ck_tile/04_img2col/README.md b/example/ck_tile/04_img2col/README.md index 6ae2cea5e5..df5c51a9c0 100644 --- a/example/ck_tile/04_img2col/README.md +++ b/example/ck_tile/04_img2col/README.md @@ -6,7 +6,8 @@ This folder contains example for Image to Column using ck_tile tile-programming ``` # in the root of ck_tile mkdir build && cd build -sh ../script/cmake-ck-dev.sh ../ <arch> # you can replace this <arch> to gfx90a, gfx942... +# you can replace <arch> with the appropriate architecture (for example gfx90a or gfx942) or leave it blank +sh ../script/cmake-ck-dev.sh ../ <arch> make tile_example_img2col -j ``` This will result in an executable `build/bin/tile_example_img2col` diff --git a/include/ck_tile/host/arg_parser.hpp b/include/ck_tile/host/arg_parser.hpp index 5f8a78b4c9..3765156df0 100644 --- a/include/ck_tile/host/arg_parser.hpp +++ b/include/ck_tile/host/arg_parser.hpp @@ -50,12 +50,22 @@ class ArgParser } return *this; } - void print() + void print() const { + // find max key length + std::string::size_type max_key_length = 11; + for(auto& key : keys) + { + if(max_key_length < key.length()) + { + max_key_length = key.length(); + } + } + printf("args:\n"); for(auto& key : keys) { - auto value = input_map[key]; + auto value = input_map.at(key); std::vector help_text_lines; size_t pos = 0; for(size_t next_pos = value.help_text.find('\n', pos); next_pos != std::string::npos;) @@ -69,8 +79,7 @@ class ArgParser std::string(value.help_text.begin() + pos, value.help_text.end())); std::string default_value = std::string("(default:") + value.value + std::string(")"); - - std::cout << std::setw(2) << std::setw(12 - value.name.length()) << "-" << key + std::cout << std::setw(1 + max_key_length - value.name.length()) << "-" << key << std::setw(4) << " 
" << help_text_lines[0] << " " << default_value << std::endl; @@ -78,7 +87,8 @@ class ArgParser help_next_line != help_text_lines.end(); ++help_next_line) { - std::cout << std::setw(17) << " " << *help_next_line << std::endl; + std::cout << std::setw(1 + max_key_length + 4) << " " << *help_next_line + << std::endl; } } } diff --git a/script/cmake-ck-dev.sh b/script/cmake-ck-dev.sh index 5dae86089a..4097ca98f6 100755 --- a/script/cmake-ck-dev.sh +++ b/script/cmake-ck-dev.sh @@ -7,7 +7,8 @@ MY_PROJECT_SOURCE=$1 if [ $# -ge 2 ] ; then GPU_TARGETS=$2 - REST_ARGS=${@:3} + shift 2 + REST_ARGS=$@ else GPU_TARGETS="gfx908;gfx90a;gfx940" REST_ARGS= diff --git a/script/cmake-ck-release.sh b/script/cmake-ck-release.sh index f65ec610dd..5e3f7faac2 100755 --- a/script/cmake-ck-release.sh +++ b/script/cmake-ck-release.sh @@ -7,7 +7,8 @@ MY_PROJECT_SOURCE=$1 if [ $# -ge 2 ] ; then GPU_TARGETS=$2 - REST_ARGS=${@:3} + shift 2 + REST_ARGS=$@ else GPU_TARGETS="gfx908;gfx90a;gfx940" REST_ARGS=