From 74d68e3b991dbfff7f14881a572bc77f4954c4fc Mon Sep 17 00:00:00 2001 From: Qianfeng Date: Tue, 8 Oct 2024 10:44:34 +0800 Subject: [PATCH 1/6] [CK_TILE] Simplify the codes in splitkv_combine pipeline (#1549) * Simplify the codes in splitkv_combine pipeline * Always set kPadSeqLenK=true for fmha splitkv kernels * Change in Oacc Alignment and TileDistribution to be more adaptable to tile sizes --------- Co-authored-by: Po Yen Chen --- .../01_fmha/codegen/ops/fmha_fwd_splitkv.py | 4 +- ...lock_fmha_fwd_splitkv_combine_pipeline.hpp | 90 ++++++++++--------- ...plitkv_combine_pipeline_default_policy.hpp | 23 +++-- 3 files changed, 67 insertions(+), 50 deletions(-) diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py index ba826c8fb3..82cf3a5ab2 100644 --- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py +++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py @@ -600,8 +600,8 @@ def get_fwd_splitkv_blobs(kernel_filter : Optional[str], receipt, mask_impl) -> # TODO: use async pipeline when compiler is more stable if hdim == 256 or hdim in [32, 64, 128]: # if True: - pipelines.append(Pipeline('qr', 'row', 'f', 'f', 'f', 'f', bias, lse, squant, pagedkv, mask)) - pipelines.append(Pipeline('qr', 'col', 'f', 'f', 'f', 'f', bias, lse, squant, pagedkv, mask)) + pipelines.append(Pipeline('qr', 'row', 'f', 't', 'f', 'f', bias, lse, squant, pagedkv, mask)) + pipelines.append(Pipeline('qr', 'col', 'f', 't', 'f', 'f', bias, lse, squant, pagedkv, mask)) pipelines.append(Pipeline('qr', 'row', 't', 't', 't', 't', bias, lse, squant, pagedkv, mask)) pipelines.append(Pipeline('qr', 'col', 't', 't', 't', 't', bias, lse, squant, pagedkv, mask)) diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_combine_pipeline.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_combine_pipeline.hpp index 842090afbe..1afe0feab3 100644 --- a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_combine_pipeline.hpp +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_combine_pipeline.hpp @@ -172,22 +172,27 @@ struct BlockFmhaFwdSplitKVCombinePipeline lse_accum, sequence<1>{}, f_max, -numeric::infinity()); block_tile_reduce_sync(lse_max, f_max, bool_constant{}); - static const auto get_validated_m = [](LSEDataType raw_m) { - return raw_m == -numeric::infinity() ? type_convert(0.f) - : raw_m; - }; - decltype(lse_accum) lse_exp; { constexpr auto spans = decltype(lse_exp)::get_distributed_spans(); sweep_tile_span(spans[number<0>{}], [&](auto idx0) { constexpr auto i_idx = make_tuple(idx0); - sweep_tile_span(spans[number<1>{}], [&](auto idx1) { - constexpr auto i_j_idx = make_tuple(idx0, idx1); + if(lse_max[i_idx] == -numeric::infinity()) + { + sweep_tile_span(spans[number<1>{}], [&](auto idx1) { + constexpr auto i_j_idx = make_tuple(idx0, idx1); - lse_exp(i_j_idx) = - ck_tile::exp(lse_accum(i_j_idx) - get_validated_m(lse_max(i_idx))); - }); + lse_exp(i_j_idx) = ck_tile::type_convert(0.0f); + }); + } + else + { + sweep_tile_span(spans[number<1>{}], [&](auto idx1) { + constexpr auto i_j_idx = make_tuple(idx0, idx1); + + lse_exp(i_j_idx) = ck_tile::exp(lse_accum(i_j_idx) - lse_max(i_idx)); + }); + } }); } @@ -201,15 +206,10 @@ struct BlockFmhaFwdSplitKVCombinePipeline sweep_tile_span(spans[number<0>{}], [&](auto idx0) { constexpr auto i_idx = make_tuple(idx0); - if(lse_sum(i_idx) == 0.f || lse_sum(i_idx) != lse_sum(i_idx)) - { - lse_logsum(i_idx) = numeric::infinity(); - } + if(lse_sum[i_idx] == ck_tile::type_convert(0.0f)) + lse_logsum(i_idx) = -numeric::infinity(); else - { - lse_logsum(i_idx) = - ck_tile::log(lse_sum(i_idx)) + get_validated_m(lse_max(i_idx)); - } + lse_logsum(i_idx) = ck_tile::log(lse_sum(i_idx)) + lse_max(i_idx); }); } @@ -218,37 +218,47 @@ struct BlockFmhaFwdSplitKVCombinePipeline constexpr auto spans = decltype(lse_accum)::get_distributed_spans(); sweep_tile_span(spans[number<0>{}], [&](auto idx0) { constexpr auto i_idx = make_tuple(idx0); - sweep_tile_span(spans[number<1>{}], [&](auto idx1) { - constexpr auto i_j_idx = make_tuple(idx0, idx1); + if(lse_logsum(i_idx) == -numeric::infinity()) + { + sweep_tile_span(spans[number<1>{}], [&](auto idx1) { + constexpr auto i_j_idx = make_tuple(idx0, idx1); - const auto x_indices = get_x_indices_from_distributed_indices( - lse_accum.get_tile_distribution(), i_j_idx); + const auto x_indices = get_x_indices_from_distributed_indices( + lse_accum.get_tile_distribution(), i_j_idx); - const auto col = x_indices.at(number<1>{}); - if(col < num_splits) - { - const auto row = x_indices.at(number<0>{}); + const auto col = x_indices.at(number<1>{}); + if(col < num_splits) + { + const auto row = x_indices.at(number<0>{}); - lse_acc_lds(row, col) = - ck_tile::exp(lse_accum(i_j_idx) - lse_logsum(i_idx)); - } - }); + lse_acc_lds(row, col) = ck_tile::type_convert(0.0f); + } + }); + } + else + { + sweep_tile_span(spans[number<1>{}], [&](auto idx1) { + constexpr auto i_j_idx = make_tuple(idx0, idx1); + + const auto x_indices = get_x_indices_from_distributed_indices( + lse_accum.get_tile_distribution(), i_j_idx); + + const auto col = x_indices.at(number<1>{}); + if(col < num_splits) + { + const auto row = x_indices.at(number<0>{}); + + lse_acc_lds(row, col) = + ck_tile::exp(lse_accum(i_j_idx) - lse_logsum(i_idx)); + } + }); + } }); } block_sync_lds(); if constexpr(kStoreLSE) { - constexpr auto spans = decltype(lse_logsum)::get_distributed_spans(); - sweep_tile_span(spans[number<0>{}], [&](auto idx0) { - constexpr auto i_idx = make_tuple(idx0); - - if(lse_logsum(i_idx) == numeric::infinity()) - { - lse_logsum(i_idx) = -numeric::infinity(); - } - }); - store_tile(lse_dram_window_tmp, tile_elementwise_in(lse_element_func, lse_logsum)); } diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_combine_pipeline_default_policy.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_combine_pipeline_default_policy.hpp index 2eb092f055..3327d4af87 100644 --- a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_combine_pipeline_default_policy.hpp +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_combine_pipeline_default_policy.hpp @@ -21,14 +21,23 @@ struct BlockFmhaFwdSplitKVCombinePipelineDefaultPolicy CK_TILE_HOST_DEVICE static constexpr auto GetAlignmentOacc() { using OaccDataType = remove_cvref_t; - return 16 / sizeof(OaccDataType); + + constexpr index_t kBlockSize = Problem::kBlockSize; + constexpr index_t kMPerBlock = Problem::kM0; + constexpr index_t kNPerBlock = Problem::kN1; + + constexpr index_t M1 = kBlockSize / get_warp_size(); + constexpr index_t M2 = min(kMPerBlock / M1, get_warp_size()); + constexpr index_t N0 = get_warp_size() / M2; + constexpr index_t N1 = kNPerBlock / N0; + + return min(N1, static_cast(16 / sizeof(OaccDataType))); } template CK_TILE_HOST_DEVICE static constexpr auto GetAlignmentO() { - using ODataType = remove_cvref_t; - return 16 / sizeof(ODataType); + return GetAlignmentOacc(); } template @@ -150,16 +159,14 @@ struct BlockFmhaFwdSplitKVCombinePipelineDefaultPolicy template CK_TILE_HOST_DEVICE static constexpr auto MakeOaccDramTileDistribution() { - using OaccDataType = remove_cvref_t; - constexpr index_t kBlockSize = Problem::kBlockSize; constexpr index_t kMPerBlock = Problem::kM0; constexpr index_t kNPerBlock = Problem::kN1; - constexpr index_t N1 = 16 / sizeof(OaccDataType); - constexpr index_t N0 = kNPerBlock / N1; - constexpr index_t M2 = get_warp_size() / N0; constexpr index_t M1 = kBlockSize / get_warp_size(); + constexpr index_t M2 = min(kMPerBlock / M1, get_warp_size()); + constexpr index_t N0 = get_warp_size() / M2; + constexpr index_t N1 = kNPerBlock / N0; constexpr index_t M0 = kMPerBlock / (M2 * M1); return make_static_tile_distribution( From 0c094daa7e3fcc3c4b4a6d75c85c31f2925f02a8 Mon Sep 17 00:00:00 2001 From: Po Yen Chen Date: Tue, 8 Oct 2024 10:45:12 +0800 Subject: [PATCH 2/6] [CK_TILE] Update example README files & fix script compatibility issue (#1548) * Fix text alignment of ArgParser::print() * Update example README files * Clarify make-ck-dev.sh usage * Only keep some of the argument from '-?' output * Undo command line output changes in README * Only keep existing argument on doc and update description * Fix text alignment * Make cmake-ck-*.sh compatible with 'sh' command --- example/ck_tile/01_fmha/README.md | 45 ++++++++++++------------ example/ck_tile/02_layernorm2d/README.md | 3 +- example/ck_tile/03_gemm/README.md | 20 +++++++---- example/ck_tile/04_img2col/README.md | 3 +- include/ck_tile/host/arg_parser.hpp | 20 ++++++++--- script/cmake-ck-dev.sh | 3 +- script/cmake-ck-release.sh | 3 +- 7 files changed, 60 insertions(+), 37 deletions(-) diff --git a/example/ck_tile/01_fmha/README.md b/example/ck_tile/01_fmha/README.md index 0803d54d66..c7ab296c3b 100644 --- a/example/ck_tile/01_fmha/README.md +++ b/example/ck_tile/01_fmha/README.md @@ -6,7 +6,8 @@ This folder contains example for fmha(fused multi-head attention) using ck_tile ``` # in the root of ck_tile mkdir build && cd build -sh ../script/cmake-ck-dev.sh ../ # you can replace this to gfx90a, gfx942... +# you can replace with the appropriate architecture (for example gfx90a or gfx942) or leave it blank +sh ../script/cmake-ck-dev.sh ../ make tile_example_fmha_fwd -j ``` This will result in an executable `build/bin/tile_example_fmha_fwd` @@ -23,7 +24,7 @@ There are 3 template parameters for this kernel template. To speed up compile time, we instantiate the kernels into separate file. In this way we can benefit from parallel building from CMake/Make system. This is achieved by `generate.py` script. Besides, you can look into this script to learn how to instantiate a kernel instance step by step, which is described in `FMHA_FWD_KERNEL_BODY` variable. ## executable -`tile_example_fmha_fwd` is the example executable, implemented in `fmha_fwd.cpp`. You can type `./bin/tile_example_fmha_fwd -?` to list all supported args. Below is an example of the output (may subject to change) +`tile_example_fmha_fwd` is the example executable, implemented in `fmha_fwd.cpp`. You can type `./bin/tile_example_fmha_fwd -?` to list all the arguments. Below is an example of the output (may subject to change) ``` args: -v weather do CPU validation or not (default:1) @@ -31,48 +32,48 @@ args: -b batch size (default:2) -h num of head, for q (default:8) -h_k num of head, for k/v, -1 means equal to h (default:-1) - if not equal to h, then this is GQA/MQA case + if not equal to h, then this is GQA/MQA case -s seqlen_q. if group-mode, means the average value of seqlen_q (default:3328) - total_seqlen_q = seqlen_q * batch, and seqlen_q per batch may vary - also with "-s=s0,s1,s2..." comma seperated int to set per batch seqlen(group-mode) - -s_k seqlen_k, -1 means equal to s (default:-1) + total_seqlen_q = seqlen_q * batch, and seqlen_q per batch may vary + also with "-s=s0,s1,s2..." comma seperated int to set per batch seqlen(group-mode) + -s_k seqlen_k (including new key/value), -1 means equal to s (default:-1) -d head dim for q, k (default:128) -d_v head dim for v, -1 means equal to d (default:-1) -scale_s scale factor of S. 0 means equal to 1/sqrt(hdim). (default:0) - note when squant=1, this value will be modified by range_q/k + note when squant=1, this value will be modified by range_q/k -range_q per-tensor quantization range of q. used if squant=1. (default:16) -range_k per-tensor quantization range of k. used if squant=1. (default:16) -range_v per-tensor quantization range of v. used if squant=1. (default:16) -range_p per-tensor quantization range of p [e^(s-m)]. used if squant=1. (default:1) -range_o per-tensor quantization range of o (p*v). used if squant=1. (default:16) -squant if using static quantization fusion or not. auto: fp8 will default use squant, other will not (default:auto) - 0: no static quant(not implemented) 1: apply scale_p and scale_o with respect to P and O. - calculate scale_s, scale_p, scale_o according to range_q, range_k, range_v, range_p, range_o + 0: no static quant(not implemented) 1: apply scale_p and scale_o with respect to P and O. + calculate scale_s, scale_p, scale_o according to range_q, range_k, range_v, range_p, range_o -iperm permute input (default:1) - if true, will be b*h*s*d, else b*s*h*d + if true, will be b*h*s*d, else b*s*h*d -operm permute output (default:1) -bias n or 0, no bias (default:n) - e(lementwise) or 1, elementwise bias with 1*1*s*s. e:1, 1*h*s*s. e:2, b*h*s*s - a(libi) or 2, alibi with 1*h. a:1, b*h + e(lementwise) or 1, elementwise bias with 1*1*s*s. e:1, 1*h*s*s. e:2, b*h*s*s + a(libi) or 2, alibi with 1*h. a:1, b*h -prec data type. fp16/bf16/fp8/bf8 (default:fp16) -mask 0: no mask, 1: top-left(same as 't'), 2:bottom-right(same as 'b') (default:0) - 't', top-left causal mask, 'b', bottom-r causal mask - 't:l,r', top-left sliding window attn(swa) with FA style left right size - 'b:l,r', bottom-r sliding window attn(swa) with FA style left right size - 'xt:window_size', xformer style masking from top-left, window_size negative is causal, positive is swa - 'xb:window_size', xformer style masking from bottom-r, window_size negative is causal, positive is swa - 'g:y,x', generic attention mask coordinate with y/x size (only debug purpose for now) + 't', top-left causal mask, 'b', bottom-r causal mask + 't:l,r', top-left sliding window attn(swa) with FA style left right size + 'b:l,r', bottom-r sliding window attn(swa) with FA style left right size + 'xt:window_size', xformer style masking from top-left, window_size negative is causal, positive is swa + 'xb:window_size', xformer style masking from bottom-r, window_size negative is causal, positive is swa + 'g:y,x', generic attention mask coordinate with y/x size (only debug purpose for now) -vlayout r for row-major(seqlen*hdim), c for col-major(hdim*seqlen) (default:r) -lse 0 not store lse, 1 store lse (default:0) -kname if set to 1 will print kernel name (default:0) -init init method. ui, uniform random int, ni, normalized random int (default:uf) - uf, uniform random float, nf, normalized random float, tf, trig float, uf:q, quantization + uf, uniform random float, nf, normalized random float, tf, trig float, uf:q, quantization -seed random seed used for initializing input tensors. 0 for non-deterministic seed (default:11939) + -drop_seed seed for random number generator (default:1) +-drop_offset offset for random number generator (default:0) + -drop_prefs seed and offset values are present on GPU; 0 - host, 1 - device/GPU (default:0) -warmup number of iterations before benchmark the kernel (default:5) -repeat number of iterations to benchmark the kernel (default:20) - -drop_seed seed for the random number generator for the dropout layer, default is 1 --drop_offset offset for the dropout layer which is used during random number generation, default is 0 - -drop_prefs flag to indicate `drop_seed` and `drop_offset` values if present on the GPU, default is 0, 0 - host, 1 - GPU ``` Example 1: `./bin/tile_example_fmha_fwd -b=1 -h=16 -s=16384 -d=128` will run a fmha case with batch=1, nhead=16, sequence length=16384, hdim=128, fp16 case. Example 2: `./bin/tile_example_fmha_fwd -b=1 -h=8 -s=16384 -d=64 -drop_prefs=1 -drop_seed=10 -drop_offset=1234` will run a fmha case with diff --git a/example/ck_tile/02_layernorm2d/README.md b/example/ck_tile/02_layernorm2d/README.md index 433dad04e6..66b16c1b7f 100644 --- a/example/ck_tile/02_layernorm2d/README.md +++ b/example/ck_tile/02_layernorm2d/README.md @@ -6,7 +6,8 @@ This folder contains example for Layernorm2D forward using ck_tile tile-programm ``` # in the root of ck_tile mkdir build && cd build -sh ../script/cmake-ck-dev.sh ../ # you can replace this to gfx90a, gfx942... +# you can replace with the appropriate architecture (for example gfx90a or gfx942) or leave it blank +sh ../script/cmake-ck-dev.sh ../ make tile_example_layernorm2d_fwd -j ``` This will result in an executable `build/bin/tile_example_layernorm2d_fwd` diff --git a/example/ck_tile/03_gemm/README.md b/example/ck_tile/03_gemm/README.md index 00303bf62c..aacbdf6863 100644 --- a/example/ck_tile/03_gemm/README.md +++ b/example/ck_tile/03_gemm/README.md @@ -6,7 +6,8 @@ This folder contains example for GEMM using ck_tile tile-programming implementat ``` # in the root of ck_tile mkdir build && cd build -sh ../script/cmake-ck-dev.sh ../ # you can replace this to gfx90a, gfx942... +# you can replace with the appropriate architecture (for example gfx90a or gfx942) or leave it blank +sh ../script/cmake-ck-dev.sh ../ make tile_example_gemm_basic -j ``` This will result in an executable `build/bin/tile_example_gemm_basic` @@ -14,10 +15,17 @@ This will result in an executable `build/bin/tile_example_gemm_basic` ## example ``` args: - -m m dimension (default:3328) - -n m dimension (default:4096) + -b batch size (default:1) + -m m dimension (default:1024) + -n n dimension (default:2048) -k k dimension (default:64) - -e epsilon (default:1e-5) - -v cpu validation or not (default:1) - -prec precision (default:fp16) + -stride_a Tensor A stride (default:0) + -stride_b Tensor B stride (default:0) + -stride_c Tensor C stride (default:0) + -v 0. No validation, 1. Validation on CPU, 2. Validation on GPU (default:2) + -e Absolute error tolerance (default:1e-5) + -prec data type. fp16/bf16/fp8/bf8 (default:fp16) + -warmup number of iterations before benchmark the kernel (default:10) + -repeat number of iterations to benchmark the kernel (default:100) + -timer gpu:gpu timer, cpu:cpu timer (default:gpu) ``` diff --git a/example/ck_tile/04_img2col/README.md b/example/ck_tile/04_img2col/README.md index 6ae2cea5e5..df5c51a9c0 100644 --- a/example/ck_tile/04_img2col/README.md +++ b/example/ck_tile/04_img2col/README.md @@ -6,7 +6,8 @@ This folder contains example for Image to Column using ck_tile tile-programming ``` # in the root of ck_tile mkdir build && cd build -sh ../script/cmake-ck-dev.sh ../ # you can replace this to gfx90a, gfx942... +# you can replace with the appropriate architecture (for example gfx90a or gfx942) or leave it blank +sh ../script/cmake-ck-dev.sh ../ make tile_example_img2col -j ``` This will result in an executable `build/bin/tile_example_img2col` diff --git a/include/ck_tile/host/arg_parser.hpp b/include/ck_tile/host/arg_parser.hpp index 5f8a78b4c9..3765156df0 100644 --- a/include/ck_tile/host/arg_parser.hpp +++ b/include/ck_tile/host/arg_parser.hpp @@ -50,12 +50,22 @@ class ArgParser } return *this; } - void print() + void print() const { + // find max key length + std::string::size_type max_key_length = 11; + for(auto& key : keys) + { + if(max_key_length < key.length()) + { + max_key_length = key.length(); + } + } + printf("args:\n"); for(auto& key : keys) { - auto value = input_map[key]; + auto value = input_map.at(key); std::vector help_text_lines; size_t pos = 0; for(size_t next_pos = value.help_text.find('\n', pos); next_pos != std::string::npos;) @@ -69,8 +79,7 @@ class ArgParser std::string(value.help_text.begin() + pos, value.help_text.end())); std::string default_value = std::string("(default:") + value.value + std::string(")"); - - std::cout << std::setw(2) << std::setw(12 - value.name.length()) << "-" << key + std::cout << std::setw(1 + max_key_length - value.name.length()) << "-" << key << std::setw(4) << " " << help_text_lines[0] << " " << default_value << std::endl; @@ -78,7 +87,8 @@ class ArgParser help_next_line != help_text_lines.end(); ++help_next_line) { - std::cout << std::setw(17) << " " << *help_next_line << std::endl; + std::cout << std::setw(1 + max_key_length + 4) << " " << *help_next_line + << std::endl; } } } diff --git a/script/cmake-ck-dev.sh b/script/cmake-ck-dev.sh index 5dae86089a..4097ca98f6 100755 --- a/script/cmake-ck-dev.sh +++ b/script/cmake-ck-dev.sh @@ -7,7 +7,8 @@ MY_PROJECT_SOURCE=$1 if [ $# -ge 2 ] ; then GPU_TARGETS=$2 - REST_ARGS=${@:3} + shift 2 + REST_ARGS=$@ else GPU_TARGETS="gfx908;gfx90a;gfx940" REST_ARGS= diff --git a/script/cmake-ck-release.sh b/script/cmake-ck-release.sh index f65ec610dd..5e3f7faac2 100755 --- a/script/cmake-ck-release.sh +++ b/script/cmake-ck-release.sh @@ -7,7 +7,8 @@ MY_PROJECT_SOURCE=$1 if [ $# -ge 2 ] ; then GPU_TARGETS=$2 - REST_ARGS=${@:3} + shift 2 + REST_ARGS=$@ else GPU_TARGETS="gfx908;gfx90a;gfx940" REST_ARGS= From aa932445eae1d2d8a6abb6c8a78c3fc41489ecf9 Mon Sep 17 00:00:00 2001 From: Rostyslav Geyyer <46627076+geyyer@users.noreply.github.com> Date: Tue, 8 Oct 2024 11:05:28 -0500 Subject: [PATCH 3/6] Add a gpu gemm reference kernel (#1528) * Add a gpu gemm reference kernel * Switch to gpu reference in gemm examples * Remove redundant arguments * Update all related examples * Update more examples * Try less threads per block * Try even less threads per block * Add support for all matrix layouts * Increase block size * Clean up * Remove hardcoded strides * Clean up * Try a column-major case * Revert back to row-major * Run both CPU and GPU veriffication --------- Co-authored-by: Po Yen Chen --- example/01_gemm/common.hpp | 33 +-- example/01_gemm/gemm_dl_fp16.cpp | 13 +- example/01_gemm/gemm_dl_fp32.cpp | 13 +- example/01_gemm/gemm_dl_int8.cpp | 13 +- example/01_gemm/gemm_dpp_fp16.cpp | 5 +- example/01_gemm/gemm_wmma_fp16.cpp | 13 +- example/01_gemm/gemm_xdl_bf16.cpp | 16 +- example/01_gemm/gemm_xdl_bf16_rtn.cpp | 16 +- example/01_gemm/gemm_xdl_fp16.cpp | 13 +- example/01_gemm/gemm_xdl_fp16_fp8.cpp | 13 +- example/01_gemm/gemm_xdl_fp16_v2.cpp | 13 +- example/01_gemm/gemm_xdl_fp64.cpp | 13 +- example/01_gemm/gemm_xdl_fp8.cpp | 14 + example/01_gemm/gemm_xdl_fp8_bf8.cpp | 13 +- example/01_gemm/gemm_xdl_int8.cpp | 13 +- .../01_gemm/gemm_xdl_lds_direct_load_fp16.cpp | 13 +- .../01_gemm/gemm_xdl_lds_direct_load_fp32.cpp | 13 +- example/01_gemm/gemm_xdl_streamk.cpp | 13 +- example/01_gemm/gemm_xdl_wavelet_fp16.cpp | 13 +- example/01_gemm/run_gemm_example.inc | 46 +++- .../gpu/reference_gemm.hpp | 245 ++++++++++++++++++ 21 files changed, 518 insertions(+), 39 deletions(-) create mode 100644 library/include/ck/library/reference_tensor_operation/gpu/reference_gemm.hpp diff --git a/example/01_gemm/common.hpp b/example/01_gemm/common.hpp index 3d8f4565cb..eb1738e760 100644 --- a/example/01_gemm/common.hpp +++ b/example/01_gemm/common.hpp @@ -21,6 +21,7 @@ #include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/literals.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp" +#include "ck/library/reference_tensor_operation/gpu/reference_gemm.hpp" struct ProblemSize final { @@ -28,9 +29,9 @@ struct ProblemSize final ck::index_t N = 4096; ck::index_t K = 4096; - ck::index_t StrideA = 4096; - ck::index_t StrideB = 4096; - ck::index_t StrideC = 4096; + ck::index_t StrideA = 0; + ck::index_t StrideB = 0; + ck::index_t StrideC = 0; }; struct ProblemSizeStreamK final @@ -39,9 +40,9 @@ struct ProblemSizeStreamK final ck::index_t N = 4096; ck::index_t K = 4096; - ck::index_t StrideA = 4096; - ck::index_t StrideB = 4096; - ck::index_t StrideC = 4096; + ck::index_t StrideA = 0; + ck::index_t StrideB = 0; + ck::index_t StrideC = 0; ck::index_t NumSKBlocks = -1; }; @@ -51,9 +52,9 @@ struct ProblemSizeStreamK_universal final ck::index_t N = 4096; ck::index_t K = 4096; - ck::index_t StrideA = 4096; - ck::index_t StrideB = 4096; - ck::index_t StrideC = 4096; + ck::index_t StrideA = 0; + ck::index_t StrideB = 0; + ck::index_t StrideC = 0; ck::index_t Grid_size = -1; // defaults to max occupancy ck::index_t Streamk_sel = 1; // defaults to 1-tile SK @@ -65,9 +66,9 @@ struct ProblemSizeSplitK final ck::index_t N = 4096; ck::index_t K = 4096; - ck::index_t StrideA = 4096; - ck::index_t StrideB = 4096; - ck::index_t StrideC = 4096; + ck::index_t StrideA = 0; + ck::index_t StrideB = 0; + ck::index_t StrideC = 0; ck::index_t KBatch = 1; }; @@ -125,7 +126,7 @@ bool parse_cmd_args(int argc, } else { - std::cerr << "arg1: verification (0=no, 1=yes)" << std::endl + std::cerr << "arg1: verification (0=no, 1=CPU and GPU)" << std::endl << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)" << std::endl << "arg3: time kernel (0=no, 1=yes)" << std::endl @@ -175,7 +176,7 @@ bool parse_cmd_args(int argc, else { std::cerr - << "arg1: verification (0=no, 1=yes)" << std::endl + << "arg1: verification (0=no, 1=CPU and GPU)" << std::endl << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)" << std::endl << "arg3: time kernel (0=no, 1=yes)" << std::endl << "arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC" << std::endl @@ -224,7 +225,7 @@ bool parse_cmd_args(int argc, } else { - std::cerr << "arg1: verification (0=no, 1=yes)" << std::endl + std::cerr << "arg1: verification (0=no, 1=CPU and GPU)" << std::endl << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)" << std::endl << "arg3: time kernel (0=no, 1=yes)" << std::endl @@ -274,7 +275,7 @@ bool parse_cmd_args(int argc, } else { - std::cerr << "arg1: verification (0=no, 1=yes)" << std::endl + std::cerr << "arg1: verification (0=no, 1=CPU and GPU)" << std::endl << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)" << std::endl << "arg3: time kernel (0=no, 1=yes)" << std::endl diff --git a/example/01_gemm/gemm_dl_fp16.cpp b/example/01_gemm/gemm_dl_fp16.cpp index b5fecb9752..b9284b2783 100644 --- a/example/01_gemm/gemm_dl_fp16.cpp +++ b/example/01_gemm/gemm_dl_fp16.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #include "common.hpp" @@ -32,6 +32,17 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl using ReferenceGemmInstance = ck::tensor_operation::host:: ReferenceGemm; +using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm; + #include "run_gemm_example.inc" int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); } diff --git a/example/01_gemm/gemm_dl_fp32.cpp b/example/01_gemm/gemm_dl_fp32.cpp index 212b72f2a6..1684213641 100644 --- a/example/01_gemm/gemm_dl_fp32.cpp +++ b/example/01_gemm/gemm_dl_fp32.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #include "common.hpp" @@ -32,6 +32,17 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl using ReferenceGemmInstance = ck::tensor_operation::host:: ReferenceGemm; +using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm; + #include "run_gemm_example.inc" int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); } diff --git a/example/01_gemm/gemm_dl_int8.cpp b/example/01_gemm/gemm_dl_int8.cpp index 1840390aa9..1e64e9a0a3 100644 --- a/example/01_gemm/gemm_dl_int8.cpp +++ b/example/01_gemm/gemm_dl_int8.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #include "common.hpp" @@ -32,6 +32,17 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDl using ReferenceGemmInstance = ck::tensor_operation::host:: ReferenceGemm; +using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm; + #include "run_gemm_example.inc" int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); } diff --git a/example/01_gemm/gemm_dpp_fp16.cpp b/example/01_gemm/gemm_dpp_fp16.cpp index 7a9e3f6186..30faf542dd 100644 --- a/example/01_gemm/gemm_dpp_fp16.cpp +++ b/example/01_gemm/gemm_dpp_fp16.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #include "common.hpp" @@ -34,6 +34,9 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmDpp using ReferenceGemmInstance = ck::tensor_operation::host:: ReferenceGemm; +using ReferenceGemmInstanceGPU = ck::tensor_operation::device:: + ReferenceGemm; + #include "run_gemm_example.inc" int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); } diff --git a/example/01_gemm/gemm_wmma_fp16.cpp b/example/01_gemm/gemm_wmma_fp16.cpp index f8afe8d6db..28ab878ac3 100644 --- a/example/01_gemm/gemm_wmma_fp16.cpp +++ b/example/01_gemm/gemm_wmma_fp16.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #include "common.hpp" @@ -68,6 +68,17 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmWmma_CShuffle using ReferenceGemmInstance = ck::tensor_operation::host:: ReferenceGemm; +using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm; + #include "run_gemm_example.inc" int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); } diff --git a/example/01_gemm/gemm_xdl_bf16.cpp b/example/01_gemm/gemm_xdl_bf16.cpp index 3cac55ef47..6cfff30dbd 100644 --- a/example/01_gemm/gemm_xdl_bf16.cpp +++ b/example/01_gemm/gemm_xdl_bf16.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #include "common.hpp" @@ -33,6 +33,20 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle using ReferenceGemmInstance = ck::tensor_operation::host:: ReferenceGemm; +using ReferenceComputeType = float; +using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm; + #include "run_gemm_example.inc" int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); } diff --git a/example/01_gemm/gemm_xdl_bf16_rtn.cpp b/example/01_gemm/gemm_xdl_bf16_rtn.cpp index cc14dcb8eb..108c100cbd 100644 --- a/example/01_gemm/gemm_xdl_bf16_rtn.cpp +++ b/example/01_gemm/gemm_xdl_bf16_rtn.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #include "common.hpp" @@ -34,6 +34,20 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle using ReferenceGemmInstance = ck::tensor_operation::host:: ReferenceGemm; +using ReferenceComputeType = float; +using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm; + #include "run_gemm_example.inc" int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); } diff --git a/example/01_gemm/gemm_xdl_fp16.cpp b/example/01_gemm/gemm_xdl_fp16.cpp index 2338cdc9c1..07d51855d6 100644 --- a/example/01_gemm/gemm_xdl_fp16.cpp +++ b/example/01_gemm/gemm_xdl_fp16.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #include "common.hpp" @@ -47,6 +47,17 @@ using DeviceGemmInstance = DeviceGemmInstance1; using ReferenceGemmInstance = ck::tensor_operation::host:: ReferenceGemm; +using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm; + #include "run_gemm_example.inc" int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); } diff --git a/example/01_gemm/gemm_xdl_fp16_fp8.cpp b/example/01_gemm/gemm_xdl_fp16_fp8.cpp index 979a200791..a996d034e6 100644 --- a/example/01_gemm/gemm_xdl_fp16_fp8.cpp +++ b/example/01_gemm/gemm_xdl_fp16_fp8.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #include "common.hpp" @@ -42,6 +42,17 @@ using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm; +using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm; + #include "run_gemm_example.inc" int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); } diff --git a/example/01_gemm/gemm_xdl_fp16_v2.cpp b/example/01_gemm/gemm_xdl_fp16_v2.cpp index eba0ea9d11..ecd3b7be5d 100644 --- a/example/01_gemm/gemm_xdl_fp16_v2.cpp +++ b/example/01_gemm/gemm_xdl_fp16_v2.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #include "common.hpp" @@ -46,6 +46,17 @@ using DeviceGemmInstance = using ReferenceGemmInstance = ck::tensor_operation::host:: ReferenceGemm; +using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm; + #include "run_gemm_example.inc" int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); } diff --git a/example/01_gemm/gemm_xdl_fp64.cpp b/example/01_gemm/gemm_xdl_fp64.cpp index 8361576299..5afb3d1554 100644 --- a/example/01_gemm/gemm_xdl_fp64.cpp +++ b/example/01_gemm/gemm_xdl_fp64.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #include "common.hpp" @@ -41,6 +41,17 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemmXdl BElementOp, CElementOp>; +using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm; + #include "run_gemm_example.inc" int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); } diff --git a/example/01_gemm/gemm_xdl_fp8.cpp b/example/01_gemm/gemm_xdl_fp8.cpp index fe41602301..3c75a44d21 100644 --- a/example/01_gemm/gemm_xdl_fp8.cpp +++ b/example/01_gemm/gemm_xdl_fp8.cpp @@ -37,6 +37,20 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle using ReferenceGemmInstance = ck::tensor_operation::host:: ReferenceGemm; +using ReferenceComputeType = float; +using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm; + #include "run_gemm_example.inc" int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); } diff --git a/example/01_gemm/gemm_xdl_fp8_bf8.cpp b/example/01_gemm/gemm_xdl_fp8_bf8.cpp index acc5fbc515..1dec165abd 100644 --- a/example/01_gemm/gemm_xdl_fp8_bf8.cpp +++ b/example/01_gemm/gemm_xdl_fp8_bf8.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #include "common.hpp" @@ -44,6 +44,17 @@ using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm; +using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm; + #include "run_gemm_example.inc" int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); } diff --git a/example/01_gemm/gemm_xdl_int8.cpp b/example/01_gemm/gemm_xdl_int8.cpp index cc03200b9d..3237f1a61c 100644 --- a/example/01_gemm/gemm_xdl_int8.cpp +++ b/example/01_gemm/gemm_xdl_int8.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #include "common.hpp" @@ -33,6 +33,17 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle using ReferenceGemmInstance = ck::tensor_operation::host:: ReferenceGemm; +using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm; + #include "run_gemm_example.inc" int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); } diff --git a/example/01_gemm/gemm_xdl_lds_direct_load_fp16.cpp b/example/01_gemm/gemm_xdl_lds_direct_load_fp16.cpp index d29cb74cd6..62037f7740 100644 --- a/example/01_gemm/gemm_xdl_lds_direct_load_fp16.cpp +++ b/example/01_gemm/gemm_xdl_lds_direct_load_fp16.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. #include @@ -53,6 +53,17 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle using ReferenceGemmInstance = ck::tensor_operation::host:: ReferenceGemm; +using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm; + #include "run_gemm_example.inc" int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); } diff --git a/example/01_gemm/gemm_xdl_lds_direct_load_fp32.cpp b/example/01_gemm/gemm_xdl_lds_direct_load_fp32.cpp index e99249389e..75971bdecf 100644 --- a/example/01_gemm/gemm_xdl_lds_direct_load_fp32.cpp +++ b/example/01_gemm/gemm_xdl_lds_direct_load_fp32.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved. #include @@ -52,6 +52,17 @@ using DeviceGemmInstance = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle using ReferenceGemmInstance = ck::tensor_operation::host:: ReferenceGemm; +using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm; + #include "run_gemm_example.inc" int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); } diff --git a/example/01_gemm/gemm_xdl_streamk.cpp b/example/01_gemm/gemm_xdl_streamk.cpp index 7d433b6145..5a02457daf 100644 --- a/example/01_gemm/gemm_xdl_streamk.cpp +++ b/example/01_gemm/gemm_xdl_streamk.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #include "common.hpp" @@ -44,6 +44,17 @@ using DeviceGemmInstance = DeviceGemmStreamK; using ReferenceGemmInstance = ck::tensor_operation::host:: ReferenceGemm; +using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm; + #include "run_gemm_example.inc" int main(int argc, char* argv[]) { return !run_gemm_streamk_example(argc, argv); } diff --git a/example/01_gemm/gemm_xdl_wavelet_fp16.cpp b/example/01_gemm/gemm_xdl_wavelet_fp16.cpp index b0f963fee5..d8672f6a0c 100644 --- a/example/01_gemm/gemm_xdl_wavelet_fp16.cpp +++ b/example/01_gemm/gemm_xdl_wavelet_fp16.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #include "common.hpp" @@ -37,6 +37,17 @@ using DeviceGemmInstance = DeviceGemmInstance; using ReferenceGemmInstance = ck::tensor_operation::host:: ReferenceGemm; +using ReferenceGemmInstanceGPU = ck::tensor_operation::device::ReferenceGemm; + #include "run_gemm_example.inc" int main(int argc, char* argv[]) { return !run_gemm_example(argc, argv); } diff --git a/example/01_gemm/run_gemm_example.inc b/example/01_gemm/run_gemm_example.inc index a6f0d0bcfe..f66d2adc11 100644 --- a/example/01_gemm/run_gemm_example.inc +++ b/example/01_gemm/run_gemm_example.inc @@ -173,6 +173,7 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config) Tensor c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); Tensor c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); + Tensor c_m_n_device_ref_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{})); std::cout << "a_m_k: " << a_m_k.mDesc << std::endl; std::cout << "b_k_n: " << b_k_n.mDesc << std::endl; @@ -193,6 +194,8 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config) DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize()); DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize()); DeviceMem c_m_n_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize()); + DeviceMem c_m_n_device_ref_buf(sizeof(CDataType) * + c_m_n_device_ref_result.mDesc.GetElementSpaceSize()); a_m_k_device_buf.ToDevice(a_m_k.mData.data()); b_k_n_device_buf.ToDevice(b_k_n.mData.data()); @@ -325,14 +328,18 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config) std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " << gemm.GetTypeString() << std::endl; + bool pass = true; + if(config.do_verification) { + // CPU verification auto ref_gemm = ReferenceGemmInstance{}; auto ref_invoker = ref_gemm.MakeInvoker(); auto ref_argument = ref_gemm.MakeArgument( a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op); + std::cout << "Running verification on CPU." << std::endl; ref_invoker.Run(ref_argument); #ifdef BUILD_INT4_EXAMPLE @@ -346,15 +353,42 @@ bool run_gemm(const ProblemType& problem_size, const ExecutionConfig& config) #else c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data()); - return ck::utils::check_err(c_m_n_device_result, - c_m_n_host_result, - "Error: Incorrect results!", - get_rtol(), - get_atol()); + pass &= !ck::utils::check_err(c_m_n_device_result, + c_m_n_host_result, + "Error: Incorrect results!", + get_rtol(), + get_atol()); #endif + + // GPU verification + auto ref_gemm_gpu = ReferenceGemmInstanceGPU{}; + auto ref_invoker_gpu = ref_gemm_gpu.MakeInvoker(); + + auto ref_argument_gpu = ref_gemm_gpu.MakeArgument( + static_cast(a_m_k_device_buf.GetDeviceBuffer()), + static_cast(b_k_n_device_buf.GetDeviceBuffer()), + static_cast(c_m_n_device_ref_buf.GetDeviceBuffer()), + M, + N, + K, + a_element_op, + b_element_op, + c_element_op); + + std::cout << "Running verification on GPU." << std::endl; + ref_invoker_gpu.Run(ref_argument_gpu, StreamConfig{}); + + c_m_n_device_ref_buf.FromDevice(c_m_n_device_ref_result.mData.data()); + c_m_n_device_buf.FromDevice(c_m_n_device_result.mData.data()); + + pass &= !ck::utils::check_err(c_m_n_device_result, + c_m_n_device_ref_result, + "Error: Incorrect results!", + get_rtol(), + get_atol()); } - return true; + return !pass; } bool run_gemm_example(int argc, char* argv[]) diff --git a/library/include/ck/library/reference_tensor_operation/gpu/reference_gemm.hpp b/library/include/ck/library/reference_tensor_operation/gpu/reference_gemm.hpp new file mode 100644 index 0000000000..639b5fe80f --- /dev/null +++ b/library/include/ck/library/reference_tensor_operation/gpu/reference_gemm.hpp @@ -0,0 +1,245 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include + +#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/device/device_base.hpp" + +namespace ck { + +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + naive_gemm_kernel(const ADataType* __restrict__ p_a_grid, + const BDataType* __restrict__ p_b_grid, + CDataType* __restrict__ p_c_grid, + index_t m, + index_t n, + index_t k, + const AElementwiseOperation a_element_op, + const BElementwiseOperation b_element_op, + const CDEElementwiseOperation c_element_op) +{ + using RowMajor = ck::tensor_layout::gemm::RowMajor; + + const int row_idx = blockIdx.x * blockDim.x + threadIdx.x; + const int col_idx = blockIdx.y * blockDim.y + threadIdx.y; + + if(row_idx < m && col_idx < n) + { + + AccDataType v_acc = static_cast(0.0); + ComputeTypeA v_a = static_cast(0.0); + ComputeTypeB v_b = static_cast(0.0); + CDataType v_c = static_cast(0.0); + + for(int k_idx = 0; k_idx < k; ++k_idx) + { + // check input matrices layout + int element_idx_a = 0; + int element_idx_b = 0; + if constexpr(std::is_same_v) + { + element_idx_a = row_idx * k + k_idx; + } + else + { + element_idx_a = row_idx + m * k_idx; + } + if constexpr(std::is_same_v) + { + element_idx_b = k_idx * n + col_idx; + } + else + { + element_idx_b = k_idx + k * col_idx; + } + // apply a_element_op + a_element_op(v_a, p_a_grid[element_idx_a]); + // apply b_element_op + b_element_op(v_b, p_b_grid[element_idx_b]); + // multiply and accumulate + v_acc += static_cast(v_a) * static_cast(v_b); + } + // apply c_element_op + c_element_op(v_c, v_acc); + // check output matrix layout + int element_idx_c = 0; + if constexpr(std::is_same_v) + { + element_idx_c = row_idx * n + col_idx; + } + else + { + element_idx_c = row_idx + m * col_idx; + } + // prepare output + p_c_grid[element_idx_c] = v_c; + } +} + +} // namespace ck + +namespace ck { +namespace tensor_operation { +namespace device { + +template +struct ReferenceGemm : public device::BaseOperator +{ + // Argument + struct Argument : public device::BaseArgument + { + Argument(const void* p_a_grid, + const void* p_b_grid, + void* p_c_grid, + index_t m, + index_t n, + index_t k, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CElementwiseOperation c_element_op) + : p_a_grid_{static_cast(p_a_grid)}, + p_b_grid_{static_cast(p_b_grid)}, + p_c_grid_{static_cast(p_c_grid)}, + m_{m}, + n_{n}, + k_{k}, + a_element_op_{a_element_op}, + b_element_op_{b_element_op}, + c_element_op_{c_element_op} + { + } + + const ADataType* p_a_grid_; + const BDataType* p_b_grid_; + CDataType* p_c_grid_; + + index_t m_; + index_t n_; + index_t k_; + + AElementwiseOperation a_element_op_; + BElementwiseOperation b_element_op_; + CElementwiseOperation c_element_op_; + }; + + // Invoker + struct Invoker : public device::BaseInvoker + { + using Argument = ReferenceGemm::Argument; + + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + int block_size = 16; + dim3 block_dim(block_size, block_size, 1); + dim3 grid_dim( + (arg.m_ + block_size - 1) / block_size, (arg.n_ + block_size - 1) / block_size, 1); + + auto launch_kernel = [&]() { + const auto kernel = naive_gemm_kernel; + + return launch_and_time_kernel(stream_config, + kernel, + grid_dim, + block_dim, + 0, + arg.p_a_grid_, + arg.p_b_grid_, + arg.p_c_grid_, + arg.m_, + arg.n_, + arg.k_, + arg.a_element_op_, + arg.b_element_op_, + arg.c_element_op_); + }; + + return launch_kernel(); + } + + float Run(const device::BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + }; + + bool IsSupportedArgument(const device::BaseArgument*) override { return true; } + + static auto MakeArgument(const void* p_a_grid, + const void* p_b_grid, + void* p_c_grid, + index_t m, + index_t n, + index_t k, + AElementwiseOperation a_element_op, + BElementwiseOperation b_element_op, + CElementwiseOperation c_element_op) + { + return Argument{ + p_a_grid, p_b_grid, p_c_grid, m, n, k, a_element_op, b_element_op, c_element_op}; + } + + static auto MakeInvoker() { return Invoker{}; } + + virtual std::unique_ptr MakeInvokerPointer() + { + return std::make_unique(Invoker{}); + } + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "Device Reference Gemm" + << std::endl; + // clang-format on + + return str.str(); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck From ceaed8e097cbba105e23f465c6226ab48e37a3a8 Mon Sep 17 00:00:00 2001 From: Christopher Millette <63608002+cgmillette@users.noreply.github.com> Date: Wed, 9 Oct 2024 01:41:35 -0600 Subject: [PATCH 4/6] Fixes small memory leak from missing hipEventDestroy (#1554) --- include/ck/host_utility/kernel_launch.hpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/ck/host_utility/kernel_launch.hpp b/include/ck/host_utility/kernel_launch.hpp index a616433ac9..962f89e479 100644 --- a/include/ck/host_utility/kernel_launch.hpp +++ b/include/ck/host_utility/kernel_launch.hpp @@ -66,6 +66,9 @@ float launch_and_time_kernel(const StreamConfig& stream_config, hip_check_error(hipEventElapsedTime(&total_time, start, stop)); + hip_check_error(hipEventDestroy(start)); + hip_check_error(hipEventDestroy(stop)); + return total_time / nrepeat; } else @@ -143,6 +146,9 @@ float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config, hip_check_error(hipEventElapsedTime(&total_time, start, stop)); + hip_check_error(hipEventDestroy(start)); + hip_check_error(hipEventDestroy(stop)); + return total_time / nrepeat; } else From cfac9497e28a7489d5cde5bf2b4f40691dd5659c Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Wed, 9 Oct 2024 10:18:05 -0700 Subject: [PATCH 5/6] remove gfx12 targets from daily builds with rocm6.2 (#1560) --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index e61fb71e8e..a79ed859f2 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -1138,7 +1138,7 @@ pipeline { execute_args = """ cmake -D CMAKE_PREFIX_PATH=/opt/rocm \ -D CMAKE_CXX_COMPILER="${build_compiler()}" \ -D CMAKE_BUILD_TYPE=Release \ - -D GPU_ARCHS="gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201" \ + -D GPU_ARCHS="gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102" \ -D CMAKE_CXX_FLAGS=" -O3 " .. && make -j64 """ } steps{ From 2e1165c1a73552dbacf08ccd351314ae95de14f7 Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Wed, 9 Oct 2024 15:21:57 -0700 Subject: [PATCH 6/6] fix the target selection logic (#1561) --- CMakeLists.txt | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6ad6307cb3..3f22bb4b61 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -132,7 +132,11 @@ if(GPU_ARCHS) unset(GPU_TARGETS CACHE) unset(AMDGPU_TARGETS CACHE) endif() - +if(GPU_TARGETS) + set(USER_GPU_TARGETS 1) +else() + set(USER_GPU_TARGETS 0) +endif() find_package(hip) # No assumption that HIP kernels are launched with uniform block size for backward compatibility # SWDEV-413293 and https://reviews.llvm.org/D155213 @@ -162,7 +166,7 @@ endif() if(GPU_ARCHS) set(CK_GPU_TARGETS ${GPU_ARCHS}) else() - if(GPU_TARGETS) + if(USER_GPU_TARGETS) set(CK_GPU_TARGETS ${GPU_TARGETS}) endif() endif()