From 254758813fa9c5e2ed8a03131a9c34f4e7c578b8 Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Fri, 10 May 2024 09:41:39 -0700 Subject: [PATCH] Code clean-up (#1285) * code clean-up * remove the profiling output samples [ROCm/composable_kernel commit: 566b6480a2e6e1245033f256eca0dce097bd5d75] --- CMakeLists.txt | 6 +- Jenkinsfile | 39 ++++----- client_example/25_wrapper/wrapper_img2col.cpp | 1 - example/01_gemm/README.md | 14 ---- example/02_gemm_bilinear/README.md | 17 ---- example/04_gemm_add_add_fastgelu/README.md | 13 --- example/09_convnd_fwd/README.md | 14 ---- example/15_grouped_gemm/README.md | 16 ---- example/26_contraction/README.md | 11 --- .../30_grouped_conv_fwd_multiple_d/README.md | 12 --- example/46_gemm_add_multiply/README.md | 16 ---- include/ck/ck.hpp | 2 +- include/ck/host_utility/device_prop.hpp | 6 +- ...d_contraction_multiple_d_wmma_cshuffle.hpp | 2 +- .../device_batched_gemm_multiple_d_dl.hpp | 2 +- ...emm_softmax_gemm_permute_wmma_cshuffle.hpp | 4 +- .../device_convnd_bwd_data_nwc_kxc_nwk_dl.hpp | 4 +- .../device/impl/device_fpAintB_gemm_wmma.hpp | 2 +- .../gpu/device/impl/device_gemm_dl.hpp | 4 +- .../gpu/device/impl/device_gemm_dpp.hpp | 2 +- .../device/impl/device_gemm_multiple_d_dl.hpp | 2 +- .../device_gemm_multiple_d_wmma_cshuffle.hpp | 2 +- .../gpu/device/impl/device_gemm_wmma.hpp | 2 +- ...conv_bwd_data_multiple_d_wmma_cshuffle.hpp | 2 +- ..._grouped_conv_bwd_weight_wmma_cshuffle.hpp | 2 +- ..._conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp | 2 +- ...ice_grouped_conv_fwd_dl_nhwc_kyxc_nhwk.hpp | 4 +- ...uped_conv_fwd_multiple_d_wmma_cshuffle.hpp | 2 +- .../device_grouped_gemm_multiple_d_dl.hpp | 2 +- ...e_grouped_query_attention_forward_wmma.hpp | 4 +- ...ice_multi_query_attention_forward_wmma.hpp | 4 +- .../gpu/grid/block_to_ctile_map.hpp | 2 +- .../tensor_operation/gpu/warp/wmma_gemm.hpp | 2 +- include/ck/utility/amd_xdlops.hpp | 2 +- include/ck/utility/type_convert.hpp | 2 +- profiler/README.md | 83 ------------------- script/test_convnd_fwd.sh | 2 +- .../test_grouped_convnd_bwd_weight.cpp | 8 +- 38 files changed, 57 insertions(+), 259 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e3113a31dd..c23746e7f3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -202,7 +202,7 @@ endif() option(USE_BITINT_EXTENSION_INT4 "Whether to enable clang's BitInt extension to provide int4 data type." OFF) -option(USE_OPT_NAVI3X "Whether to enable LDS cumode and Wavefront32 mode for NAVI3X silicons." OFF) +option(USE_OPT_GFX11 "Whether to enable LDS cumode and Wavefront32 mode for GFX11 silicons." OFF) if(USE_BITINT_EXTENSION_INT4) add_compile_definitions(CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4) @@ -210,10 +210,10 @@ if(USE_BITINT_EXTENSION_INT4) message("CK compiled with USE_BITINT_EXTENSION_INT4 set to ${USE_BITINT_EXTENSION_INT4}") endif() -if(USE_OPT_NAVI3X) +if(USE_OPT_GFX11) add_compile_options(-mcumode) add_compile_options(-mno-wavefrontsize64) - message("CK compiled with USE_OPT_NAVI3X set to ${USE_OPT_NAVI3X}") + message("CK compiled with USE_OPT_GFX11 set to ${USE_OPT_GFX11}") endif() ## Threads diff --git a/Jenkinsfile b/Jenkinsfile index d334549bb0..75800bfc94 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -515,30 +515,25 @@ def Build_CK(Map conf=[:]){ withDockerContainer(image: image, args: dockerOpts + ' -v=/var/jenkins/:/var/jenkins') { timeout(time: 24, unit: 'HOURS') { - //check whether running on Navi or MI300 node - def navi_node = 0 - def mi300_node = 0 + //check whether to run performance tests on this node + def do_perf_tests = 0 sh 'rocminfo | tee rocminfo.log' - if ( runShell('grep -n "gfx1030" rocminfo.log') || runShell('grep -n "gfx1101" rocminfo.log') ){ - navi_node = 1 - echo "This is a Navi node" - } - if ( runShell('grep -n "gfx942" rocminfo.log') ){ - mi300_node = 1 - echo "This is MI300 node" + if ( runShell('grep -n "gfx1030" rocminfo.log') || runShell('grep -n "gfx1101" rocminfo.log') || runShell('grep -n "gfx942" rocminfo.log') ){ + do_perf_tests = 1 + echo "Stash profiler and run performance tests" } cmake_build(conf) dir("build"){ //run tests and examples sh 'make -j check' - if (params.RUN_PERFORMANCE_TESTS && navi_node == 0 && mi300_node == 0 ){ + if (params.RUN_PERFORMANCE_TESTS && do_perf_tests == 0 ){ //we only need the ckProfiler to run the performance tests, so we pack and stash it - //do not stash profiler on Navi or MI300 nodes + //do not stash profiler on nodes where we don't need to run performance tests sh 'tar -zcvf ckProfiler.tar.gz bin/ckProfiler' stash name: "ckProfiler.tar.gz" } - if (params.RUN_FULL_QA && mi300_node == 0 ){ - // build deb packages for all MI100/200/300 targets and prepare to export + if (params.RUN_FULL_QA && do_perf_tests == 0 ){ + // build deb packages for all gfx9 targets and prepare to export sh 'make -j package' archiveArtifacts artifacts: 'composablekernel-ckprofiler_*.deb' archiveArtifacts artifacts: 'composablekernel-tests_*.deb' @@ -546,7 +541,7 @@ def Build_CK(Map conf=[:]){ stash name: "ckprofiler_0.2.0_amd64.deb" } } - if (params.hipTensor_test && navi_node == 0 ){ + if (params.hipTensor_test && do_perf_tests == 0 ){ //build and test hipTensor sh """#!/bin/bash rm -rf "${params.hipTensor_branch}".zip @@ -814,7 +809,7 @@ pipeline { { parallel { - stage("Run Codegen Tests on MI200") + stage("Run Codegen Tests on gfx90a") { when { beforeAgent true @@ -865,7 +860,7 @@ pipeline { cleanWs() } } - stage("Build CK and run Tests on MI300") + stage("Build CK and run Tests on gfx942") { when { beforeAgent true @@ -885,7 +880,7 @@ pipeline { cleanWs() } } - stage("Build CK and run Tests on MI200") + stage("Build CK and run Tests on gfx90a") { when { beforeAgent true @@ -925,13 +920,13 @@ pipeline { cleanWs() } } - stage("Build CK and run Tests on Navi21") + stage("Build CK and run Tests on gfx1030") { when { beforeAgent true expression { !params.RUN_FULL_QA.toBoolean() && !params.BUILD_INSTANCES_ONLY.toBoolean() } } - agent{ label rocmnode("navi21") } + agent{ label rocmnode("gfx1030") } environment{ setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1030" -DDL_KERNELS=ON -DCMAKE_CXX_FLAGS=" -O3 " """ execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \ @@ -945,13 +940,13 @@ pipeline { cleanWs() } } - stage("Build CK and run Tests on Navi32") + stage("Build CK and run Tests on gfx1101") { when { beforeAgent true expression { !params.RUN_FULL_QA.toBoolean() && !params.BUILD_INSTANCES_ONLY.toBoolean() } } - agent{ label rocmnode("navi32") } + agent{ label rocmnode("gfx1101") } environment{ setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1101" -DDL_KERNELS=ON -DCMAKE_CXX_FLAGS=" -O3 " """ execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \ diff --git a/client_example/25_wrapper/wrapper_img2col.cpp b/client_example/25_wrapper/wrapper_img2col.cpp index 2a4034d62f..ceccc5eb8f 100644 --- a/client_example/25_wrapper/wrapper_img2col.cpp +++ b/client_example/25_wrapper/wrapper_img2col.cpp @@ -181,4 +181,3 @@ int main(int argc, char* argv[]) {1, 1, 1} /*filter_dilations*/); return 0; } -// MI100 Perf: 0.255178 ms, 1698.9 GB/s, diff --git a/example/01_gemm/README.md b/example/01_gemm/README.md index 226783b03b..a09e69255f 100644 --- a/example/01_gemm/README.md +++ b/example/01_gemm/README.md @@ -7,17 +7,3 @@ #arg3: run kernel # of times (>1) ./bin/example_gemm_xdl 0 1 5 ``` - -Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16) -``` -a_m_k: dim 2, lengths {3840, 4096}, strides {4096, 1} -b_k_n: dim 2, lengths {4096, 4096}, strides {1, 4096} -c_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1} -arg.a_grid_desc_k0_m_k1_{512, 3840, 8} -arg.b_grid_desc_k0_n_k1_{512, 4096, 8} -arg.c_grid_desc_m_n_{ 3840, 4096} -launch_and_time_kernel: grid_dim {480, 1, 1}, block_dim {256, 1, 1} -Warm up -Start running 5 times... -Perf: 1.19685 ms, 107.657 TFlops, 78.8501 GB/s -``` diff --git a/example/02_gemm_bilinear/README.md b/example/02_gemm_bilinear/README.md index 9eb87e1e34..a407ce24f7 100644 --- a/example/02_gemm_bilinear/README.md +++ b/example/02_gemm_bilinear/README.md @@ -9,20 +9,3 @@ #arg11 to 12: alpha, beta ./bin/example_gemm_bilinear_xdl_fp16 1 1 1 3840 4096 4096 4096 4096 4096 4096 0.5 0.5 ``` -Result (MI100 @ 1502Mhz, 184.6TFlops peak FP16) -``` -a_m_k: dim 2, lengths {3840, 4096}, strides {4096, 1} -b_k_n: dim 2, lengths {4096, 4096}, strides {1, 4096} -c0_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1} -c_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1} -arg.a_grid_desc_k0_m_k1_{512, 3840, 8} -arg.b_grid_desc_k0_n_k1_{512, 4096, 8} -arg.c0_grid_desc_m_n_{ 3840, 4096} -arg.c_grid_desc_m_n_{ 3840, 4096} -launch_and_time_kernel: grid_dim {480, 1, 1}, block_dim {256, 1, 1} -Warm up -Start running 1 times... -Perf: 0.936965 ms, 137.517 TFlops, 102.959 GB/s -error: 0 -max_diff: 0, 558.5, 558.5 -``` diff --git a/example/04_gemm_add_add_fastgelu/README.md b/example/04_gemm_add_add_fastgelu/README.md index 08a55fb9a3..7b0d003e59 100644 --- a/example/04_gemm_add_add_fastgelu/README.md +++ b/example/04_gemm_add_add_fastgelu/README.md @@ -8,16 +8,3 @@ #arg4 to 11: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD0, StrideD1, StrideE" ./bin/example_gemm_add_add_fastgelu_xdl_fp16 1 1 1 ``` - -Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16) -``` -a_m_k: dim 2, lengths {3840, 4096}, strides {4096, 1} -b_k_n: dim 2, lengths {4096, 4096}, strides {1, 4096} -d0_m_n: dim 2, lengths {3840, 4096}, strides {0, 1} -d1_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1} -e_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1} -launch_and_time_kernel: grid_dim {480, 1, 1}, block_dim {256, 1, 1} -Warm up 1 time -Start running 10 times... -Perf: 1.26914 ms, 101.525 TFlops, 100.804 GB/s, DeviceGemmMultipleD_Xdl_CShuffle<256, 256, 128, 32, 8, 8> -``` diff --git a/example/09_convnd_fwd/README.md b/example/09_convnd_fwd/README.md index 9ab5fee549..22f90ea29a 100644 --- a/example/09_convnd_fwd/README.md +++ b/example/09_convnd_fwd/README.md @@ -16,17 +16,3 @@ # , (ie RightPy, RightPx for 2D) ./bin/example_convnd_fwd_xdl 0 1 100 ``` - -Result (MI100 @ 1087Mhz, 33.4TFlops peak FP32) -``` -input: dim 4, lengths {128, 192, 71, 71}, strides {967872, 1, 13632, 192} -weights: dim 4, lengths {256, 192, 3, 3}, strides {1728, 1, 576, 192} -output: dim 4, lengths {128, 256, 36, 36}, strides {331776, 1, 9216, 256} -arg.a_grid_desc_k0_m_k1_{432, 165888, 4} -arg.b_grid_desc_k0_n_k1_{432, 256, 4} -arg.c_grid_desc_m_n_{ 165888, 256} -launch_and_time_kernel: grid_dim {1296, 1, 1}, block_dim {256, 1, 1} -Warm up -Start running 100 times... -Perf: 4.43736 ms, 33.0753 TFlops, 150.357 GB/s -``` diff --git a/example/15_grouped_gemm/README.md b/example/15_grouped_gemm/README.md index c83b23e08c..a2afe0f4b9 100644 --- a/example/15_grouped_gemm/README.md +++ b/example/15_grouped_gemm/README.md @@ -7,19 +7,3 @@ #arg3: run kernel # of times (>1) ./bin/example_grouped_gemm_xdl_fp16 0 1 5 ``` - -Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16) -``` -gemm[0] a_m_k: dim 2, lengths {256, 64}, strides {64, 1} b_k_n: dim 2, lengths {64, 128}, strides {1, 64} c_m_n: dim 2, lengths {256, 128}, strides {128, 1} -gemm[1] a_m_k: dim 2, lengths {512, 128}, strides {128, 1} b_k_n: dim 2, lengths {128, 256}, strides {1, 128} c_m_n: dim 2, lengths {512, 256}, strides {256, 1} -gemm[2] a_m_k: dim 2, lengths {768, 192}, strides {192, 1} b_k_n: dim 2, lengths {192, 384}, strides {1, 192} c_m_n: dim 2, lengths {768, 384}, strides {384, 1} -gemm[3] a_m_k: dim 2, lengths {1024, 256}, strides {256, 1} b_k_n: dim 2, lengths {256, 512}, strides {1, 256} c_m_n: dim 2, lengths {1024, 512}, strides {512, 1} -group: 0 arg.a_grid_desc_k0_m_k1_{8, 256, 8}, arg.b_grid_desc_k0_n_k1_{8, 128, 8}, arg.c_grid_desc_m_n_{ 256, 128} -group: 1 arg.a_grid_desc_k0_m_k1_{16, 512, 8}, arg.b_grid_desc_k0_n_k1_{16, 256, 8}, arg.c_grid_desc_m_n_{ 512, 256} -group: 2 arg.a_grid_desc_k0_m_k1_{24, 768, 8}, arg.b_grid_desc_k0_n_k1_{24, 384, 8}, arg.c_grid_desc_m_n_{ 768, 384} -group: 3 arg.a_grid_desc_k0_m_k1_{32, 1024, 8}, arg.b_grid_desc_k0_n_k1_{32, 512, 8}, arg.c_grid_desc_m_n_{ 1024, 512} -launch_and_time_kernel: grid_dim {30, 1, 1}, block_dim {256, 1, 1} -Warm up -Start running 5 times... -Perf: 0.037887 ms, 11.0706 TFlops, 90.8132 GB/s, DeviceGroupedGemmXdl<256, 256, 128, 4, 8, 32, 32, 4, 2> -``` diff --git a/example/26_contraction/README.md b/example/26_contraction/README.md index c88d93cf83..acbfa84df1 100644 --- a/example/26_contraction/README.md +++ b/example/26_contraction/README.md @@ -7,14 +7,3 @@ #arg3: time kernel (0=no, 1=yes) ./bin/example_contraction_bilinear_xdl_fp32 1 1 1 ``` - -Result (MI100 @ dynammic freq, 46TFlops peak FP32) -``` -a_ms_ks: dim 4, lengths {30, 128, 32, 64}, strides {524288, 4096, 128, 1} -b_ks_ns: dim 4, lengths {32, 64, 32, 64}, strides {128, 1, 524288, 4096} -c_ms_ns: dim 4, lengths {30, 128, 32, 64}, strides {524288, 4096, 128, 1} -launch_and_time_kernel: grid_dim {240, 1, 1}, block_dim {256, 1, 1} -Warm up 1 time -Start running 10 times... -Perf: 0.843286 ms, 38.1985 TFlops, 94.5014 GB/s, DeviceContractionMultipleD_Xdl_CShuffle<256, 256, 128, 16, 4, 4> -``` diff --git a/example/30_grouped_conv_fwd_multiple_d/README.md b/example/30_grouped_conv_fwd_multiple_d/README.md index 7a0cb2d0e4..1165634e1a 100644 --- a/example/30_grouped_conv_fwd_multiple_d/README.md +++ b/example/30_grouped_conv_fwd_multiple_d/README.md @@ -16,15 +16,3 @@ Following arguments (depending on number of spatial dims): ./bin/example_grouped_conv_fwd_bias_relu_add_xdl_fp16 1 1 1 ``` -Result (MI100) -``` -in: dim 5, lengths {1, 128, 192, 71, 71}, strides {192, 967872, 1, 13632, 192} -wei: dim 5, lengths {1, 256, 192, 3, 3}, strides {442368, 1728, 1, 576, 192} -bias: dim 5, lengths {1, 128, 256, 36, 36}, strides {256, 0, 1, 0, 0} -residual: dim 5, lengths {1, 128, 256, 36, 36}, strides {256, 0, 1, 0, 0} -out: dim 5, lengths {1, 128, 256, 36, 36}, strides {256, 331776, 1, 9216, 256} -launch_and_time_kernel: grid_dim {1296, 1, 1}, block_dim {256, 1, 1} -Warm up 1 time -Start running 10 times... -Perf: 1.55981 ms, 94.0927 TFlops, 213.868 GB/s, DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<256, 128, 256, 16, Default> -``` diff --git a/example/46_gemm_add_multiply/README.md b/example/46_gemm_add_multiply/README.md index ee5cdee365..e2de4696f3 100644 --- a/example/46_gemm_add_multiply/README.md +++ b/example/46_gemm_add_multiply/README.md @@ -8,19 +8,3 @@ #arg4 to 11: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD0, StrideD1, StrideE" ./bin/example_gemm_add_multiply_dl_fp16 1 1 1 ``` - -Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16) -``` -a_m_k: dim 2, lengths {3840, 4096}, strides {4096, 1} -b_k_n: dim 2, lengths {4096, 4096}, strides {4096, 1} -d0_m_n: dim 2, lengths {3840, 4096}, strides {0, 1} -d1_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1} -e_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1} -arg.a_grid_desc_k0_m0_m1_k1_{2048, 3840, 2} -arg.b_grid_desc_k0_n0_n1_k1_{2048, 4096, 2} -arg.e_grid_desc_m_n_{ 3840, 4096} -launch_and_time_kernel: grid_dim {960, 1, 1}, block_dim {256, 1, 1} -Warm up 1 time -Start running 10 times... -Perf: 3.99904 ms, 32.22 TFlops, 31.9913 GB/s, DeviceGemmMultipleD_Dl<256, 128, 128, 16, 2, 4, 4, 1> -``` diff --git a/include/ck/ck.hpp b/include/ck/ck.hpp index c8025f53c7..55f5620616 100644 --- a/include/ck/ck.hpp +++ b/include/ck/ck.hpp @@ -236,7 +236,7 @@ CK_DECLARE_ENV_VAR_BOOL(CK_LOGGING) #ifndef CK_WORKAROUND_DENORM_FIX #define CK_WORKAROUND_DENORM_FIX 0 #else -// enable only on MI200 +// enable only for gfx90a #define CK_WORKAROUND_DENORM_FIX = CK_WORKAROUND_DENORM_FIX && defined(__gfx90a__) #endif // CK_WORKAROUND_DENORM_FIX diff --git a/include/ck/host_utility/device_prop.hpp b/include/ck/host_utility/device_prop.hpp index 13e5268752..116bb3ea02 100644 --- a/include/ck/host_utility/device_prop.hpp +++ b/include/ck/host_utility/device_prop.hpp @@ -65,20 +65,20 @@ inline bool is_lds_direct_load_supported() ck::get_device_name() == "gfx941" || ck::get_device_name() == "gfx942"; } -inline bool is_navi1_supported() +inline bool is_gfx101_supported() { return ck::get_device_name() == "gfx1010" || ck::get_device_name() == "gfx1011" || ck::get_device_name() == "gfx1012"; } -inline bool is_navi2_supported() +inline bool is_gfx103_supported() { return ck::get_device_name() == "gfx1030" || ck::get_device_name() == "gfx1031" || ck::get_device_name() == "gfx1032" || ck::get_device_name() == "gfx1034" || ck::get_device_name() == "gfx1035" || ck::get_device_name() == "gfx1036"; } -inline bool is_navi3_supported() +inline bool is_gfx11_supported() { return ck::get_device_name() == "gfx1100" || ck::get_device_name() == "gfx1101" || ck::get_device_name() == "gfx1102" || ck::get_device_name() == "gfx1103"; diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp index d35645c068..a157595593 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp @@ -829,7 +829,7 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle static bool IsSupportedArgument(const Argument& arg) { - if(ck::is_navi3_supported()) + if(ck::is_gfx11_supported()) { if constexpr(!(is_same_v || is_same_v)) { diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp index b01e029c03..8fd14afc0c 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_dl.hpp @@ -648,7 +648,7 @@ struct DeviceBatchedGemmMultipleD_Dl : public DeviceBatchedGemmMultiD || is_same_v)) { @@ -1435,7 +1435,7 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle #if 0 static bool IsSupportedArgument(const Argument& arg) { - if(ck::is_navi3_supported()) + if(ck::is_gfx11_supported()) { if constexpr(!(is_same_v || is_same_v)) { diff --git a/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_dl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_dl.hpp index 5d9f8a178c..149aca7e3e 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_dl.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_convnd_bwd_data_nwc_kxc_nwk_dl.hpp @@ -1392,8 +1392,8 @@ struct DeviceConvNdBwdDataNwcKxcNwk_Dl static bool IsSupportedArgument(const Argument& arg) { // check device - if(!(ck::get_device_name() == "gfx906" || ck::is_navi2_supported() || - ck::is_navi3_supported())) + if(!(ck::get_device_name() == "gfx906" || ck::is_gfx103_supported() || + ck::is_gfx11_supported())) { return false; } diff --git a/include/ck/tensor_operation/gpu/device/impl/device_fpAintB_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/device/impl/device_fpAintB_gemm_wmma.hpp index 4385d64c19..bf96324d00 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_fpAintB_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_fpAintB_gemm_wmma.hpp @@ -509,7 +509,7 @@ struct DeviceFpAintBGemm_Wmma_CShuffle : public DeviceGemm_dequantB || is_same_v || is_same_v)) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp index 515892142e..d3af5e63d3 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_dl.hpp @@ -535,8 +535,8 @@ struct DeviceGemmDl : public DeviceGemm || is_same_v)) { diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp index a7f2305291..93ab8a7e1d 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_wmma.hpp @@ -443,7 +443,7 @@ struct DeviceGemmWmma_CShuffle : public DeviceGemm || is_same_v || is_same_v)) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp index b0e0e6da76..6f74838fba 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_wmma_cshuffle.hpp @@ -629,7 +629,7 @@ struct DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle static bool IsSupportedArgument(const Argument& arg) { // check device - if(ck::is_navi3_supported()) + if(ck::is_gfx11_supported()) { if constexpr(!(is_same_v || is_same_v)) { diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_wmma_cshuffle.hpp index b9436c21a4..211185dfb0 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_wmma_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_wmma_cshuffle.hpp @@ -692,7 +692,7 @@ struct DeviceGroupedConvBwdWeight_Wmma_CShuffle static bool IsSupportedArgument(const Argument& arg) { // check device - if(ck::is_navi3_supported()) + if(ck::is_gfx11_supported()) { if constexpr(!(is_same_v || is_same_v)) { diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp index c3023301f3..7cfbd8a8f3 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp @@ -666,7 +666,7 @@ struct DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK // check device if(!(ck::get_device_name() == "gfx906" || ck::is_xdl_supported() || - ck::is_navi2_supported() || ck::is_navi3_supported())) + ck::is_gfx103_supported() || ck::is_gfx11_supported())) { return false; } diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_nhwc_kyxc_nhwk.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_nhwc_kyxc_nhwk.hpp index d731e5ddac..6a4d97d7d2 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_nhwc_kyxc_nhwk.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_nhwc_kyxc_nhwk.hpp @@ -601,8 +601,8 @@ struct DeviceGroupedConvFwdDl_NHWC_KYXC_NHWK : public DeviceGroupedConvFwd || is_same_v)) { diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp index 37c5b5c912..a88c7b4fb7 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp @@ -673,7 +673,7 @@ struct DeviceGroupedGemmMultipleD_Dl : public DeviceGroupedGemm || is_same_v)) { @@ -958,7 +958,7 @@ struct DeviceGroupedQueryAttentionForward_Wmma #if 0 static bool IsSupportedArgument(const Argument& arg) { - if(ck::is_navi3_supported()) + if(ck::is_gfx11_supported()) { if constexpr(!(is_same_v || is_same_v)) { diff --git a/include/ck/tensor_operation/gpu/device/impl/device_multi_query_attention_forward_wmma.hpp b/include/ck/tensor_operation/gpu/device/impl/device_multi_query_attention_forward_wmma.hpp index b7551e78a2..4e14ed3a51 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_multi_query_attention_forward_wmma.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_multi_query_attention_forward_wmma.hpp @@ -594,7 +594,7 @@ struct DeviceMultiQueryAttentionForward_Wmma static bool IsSupportedArgument(const RawArg& arg) { - if(ck::is_navi3_supported()) + if(ck::is_gfx11_supported()) { if constexpr(!(is_same_v || is_same_v)) { @@ -950,7 +950,7 @@ struct DeviceMultiQueryAttentionForward_Wmma #if 0 static bool IsSupportedArgument(const Argument& arg) { - if(ck::is_navi3_supported()) + if(ck::is_gfx11_supported()) { if constexpr(!(is_same_v || is_same_v)) { diff --git a/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp b/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp index d92f504d52..84b00fcbd6 100644 --- a/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp +++ b/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp @@ -260,7 +260,7 @@ struct BlockToCTileMap_M00_N0_M01Adapt : BlockToCTileMap_M00_N0_M01Adapt struct BlockToCTileMap_Grouped_M00_N0_M01Adapt diff --git a/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp b/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp index 70fbcec10f..565195f53e 100644 --- a/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp +++ b/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp @@ -95,7 +95,7 @@ struct wmma_type{}; - // * Fixed in Navi3x, Will be wave mode dependent on Navi4x + // * Fixed on gfx11, Will be wave mode dependent for future architectures static constexpr index_t num_src_a_vgprs_per_wave = m_per_wmma * src_a_data_size / 4; static constexpr index_t num_src_b_vgprs_per_wave = n_per_wmma * src_b_data_size / 4; // * num_acc_vgprs_per_wave alone M direction diff --git a/include/ck/utility/amd_xdlops.hpp b/include/ck/utility/amd_xdlops.hpp index 0ee52b9570..d8ccb2ea76 100644 --- a/include/ck/utility/amd_xdlops.hpp +++ b/include/ck/utility/amd_xdlops.hpp @@ -4,7 +4,7 @@ #pragma once namespace ck { -// Define the common macro for MI300 models +// Define the common macro for gfx94x models #if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) #define __gfx94__ #endif diff --git a/include/ck/utility/type_convert.hpp b/include/ck/utility/type_convert.hpp index be74b1fdc1..382b9c5551 100644 --- a/include/ck/utility/type_convert.hpp +++ b/include/ck/utility/type_convert.hpp @@ -8,7 +8,7 @@ #include "ck/utility/random_gen.hpp" namespace ck { -// Define the common macro for MI300 models +// Define the common macro for gfx94x models #if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) #define __gfx94__ #endif diff --git a/profiler/README.md b/profiler/README.md index a4daefba92..10febcabdc 100644 --- a/profiler/README.md +++ b/profiler/README.md @@ -13,15 +13,6 @@ ./bin/ckProfiler gemm 1 1 1 1 0 5 3840 4096 4096 4096 4096 4096 ``` -Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16) -```bash -a_m_k: dim 2, lengths {3840, 4096}, strides {4096, 1} -b_k_n: dim 2, lengths {4096, 4096}, strides {1, 4096} -c_m_n: dim 2, lengths {3840, 4096}, strides {4096, 1} -.... -Best Perf: 1.1933 ms, 107.977 TFlops, 79.0848 GB/s -``` - ## Profile 2D forward convolution kernels ```bash #arg1: tensor operation (conv=Convolution) @@ -37,15 +28,6 @@ Best Perf: 1.1933 ms, 107.977 TFlops, 79.0848 GB/s ################ op datatype in_layout wei_layout out_layout verify init log repeat N__ K___ C___ Y X Hi__ Wi__ Strides Dilations LeftPads RightPads ./bin/ckProfiler conv2d_fwd 1 1 1 1 1 1 0 5 128 256 192 3 3 71 71 2 2 1 1 1 1 1 1 ``` -Result (MI100 @ 1087Mhz, 133.5TFlops peak FP16) - -```bash -in_n_c_hi_wi: dim 4, lengths {128, 192, 71, 71}, strides {967872, 1, 13632, 192} -wei_k_c_y_x: dim 4, lengths {256, 192, 3, 3}, strides {1728, 1, 576, 192} -out_n_k_ho_wo: dim 4, lengths {128, 256, 36, 36}, strides {331776, 1, 9216, 256} -.... -Best Perf: 1.42509 ms, 102.988 TFlops, 234.086 GB/s -``` ## Profile contraction kernels ```bash @@ -71,16 +53,6 @@ Best Perf: 1.42509 ms, 102.988 TFlops, 234.086 GB/s ./bin/ckProfiler contraction_bilinear 0 0 2 1 0 0 0 1 1.0 1.0 128 128 128 128 128 128 ``` -Result (MI100) -```bash -a_m_k: dim 4, lengths {128, 128, 128, 128}, strides {2097152, 16384, 128, 1} -b_k_n: dim 4, lengths {128, 128, 128, 128}, strides {128, 1, 2097152, 16384} -d_m_n: dim 4, lengths {128, 128, 128, 128}, strides {2097152, 16384, 128, 1} -e_m_n: dim 4, lengths {128, 128, 128, 128}, strides {2097152, 16384, 128, 1} -.... -Best Perf: 211.405 ms, 41.6077 TFlops, 15.2372 GB/s -``` - ## Profile batched gemm multiple D kernels ```bash #arg1: tensor operation (batched_gemm_multi_d=Batched GEMM multi D); @@ -99,14 +71,6 @@ Best Perf: 211.405 ms, 41.6077 TFlops, 15.2372 GB/s ./bin/ckProfiler batched_gemm_multi_d 0 1 0 0 0 1 4096 4096 4096 4096 4096 4096 16777216 16777216 16777216 16 ``` -Result (Radeon RX 6800 XT) -```bash -arg.a_grid_desc_k0_m0_m1_k1_{2048, 4096, 2} -arg.b_grid_desc_k0_n0_n1_k1_{2048, 4096, 2} -arg.e_grid_desc_m_n_{ 4096, 4096} -.... -Best Perf: 58.0306 ms, 37.8942 TFlops, 27.7545 GB/s -``` ## Profile grouped convolution backward data kernels ```bash # arg1: tensor operation (grouped_conv_bwd_data: Grouped Convolution Backward Data) @@ -134,20 +98,6 @@ Best Perf: 58.0306 ms, 37.8942 TFlops, 27.7545 GB/s ``` -Result (MI100, FP16, GNHWC_GKYXC_GNHWK) - -```bash -out: dim 5, lengths {32, 4, 192, 28, 28}, strides {602112, 150528, 1, 5376, 192} -wei: dim 5, lengths {32, 192, 192, 3, 3}, strides {331776, 1728, 1, 576, 192} -in: dim 5, lengths {32, 4, 192, 28, 28}, strides {602112, 150528, 1, 5376, 192} -.... -Best configuration parameters: -name: DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1<256, 128, 256, 32, 8, 2, Default, 32, 32, 2, 4, 8, 4, 1, 1> -avg_time: 0.768321 -tflops: 86.6679 -GB/s: 127.947 -``` - ## Profile grouped convolution backward weight kernels ```bash # arg1: tensor operation (grouped_conv_bwd_weight: Grouped Convolution Backward Weight) @@ -179,19 +129,6 @@ GB/s: 127.947 ``` -Result (MI100, FP16, GNHWC_GKYXC_GNHWK) - -```bash -input: dim 5, lengths {32, 512, 1024, 28, 28}, strides {411041792, 802816, 1, 28672, 1024} -weight: dim 5, lengths {32, 512, 1024, 3, 3}, strides {4718592, 9216, 1, 3072, 1024} -output: dim 5, lengths {32, 512, 512, 26, 26}, strides {177209344, 346112, 1, 13312, 512} -.... -Best configuration parameters: -name: DeviceGroupedConvBwdWeight_Xdl_CShuffle<256, 256, 128, 4, Default, 8, 4, 2, 8, 4, 8, 2, 1, 1, 8> -avg_time: 68.5216 -tflops: 95.337 -GB/s: 69.2301 -``` Note: This kernel use atomic add, this will cause output buffer to be accumulated multiple times, causing verification failure. To work around it, do not use CK's own timer and do verification at the same time. ## Profile image to column/column to image kernels @@ -224,17 +161,6 @@ Note: This kernel use atomic add, this will cause output buffer to be accumulate ``` -Result (MI210, FP32, NHWC) - -```bash -input: dim 5, lengths {1, 256, 512, 28, 28}, strides {102760448, 401408, 1, 14336, 512} -output: dim 2, lengths {173056, 4608}, strides {4608, 1} -.... -Best configuration parameters: -name: DeviceImageToColumn<128, 32, 64, 4> -avg_time: 3.12326 -GB/s: 2042.59 -``` Note: Column to image kernel adds to the output memory, this will cause output buffer to be accumulated multiple times, causing verification failure. To work around it, do not use CK's own timer and do verification at the same time. ## Profile Permute scale kernels @@ -254,12 +180,3 @@ Note: Column to image kernel adds to the output memory, this will cause output b ################ op datatype verify init log time dim0 dim1 dim2 in_stride0 in_stride1 in_stride2 out_stride0 out_stride1 out_stride2 ./bin/ckProfiler permute_scale 0 1 1 0 1 64 64 64 4096 64 1 1 64 4096 ``` - -Result (MI100, FP32) - -```bash -A: dim 3, lengths {64, 64, 64}, strides {4096, 64, 1} -B: dim 3, lengths {64, 64, 64}, strides {1, 64, 4096} -.... -Best perf = 0.0146878 ms, 142.782 GB/s, DeviceElementwiseNormalizationImpl<3, 2> -``` diff --git a/script/test_convnd_fwd.sh b/script/test_convnd_fwd.sh index 1bd7a6b5d7..8bd2c2fc33 100644 --- a/script/test_convnd_fwd.sh +++ b/script/test_convnd_fwd.sh @@ -65,7 +65,7 @@ set -- "${POSITIONAL[@]}" # restore positional parameters # NUMACTL="numactl --cpunodebind=1 --membind=1" NUMACTL= # ENV_CONF= -GPU=mi100 +GPU=gfx908 PROF_ITER_COUNT=10000 LOG_DIR_PATH=../log/${LOG_DIR} set -x diff --git a/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight.cpp b/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight.cpp index d100fb1077..1c8082645c 100644 --- a/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight.cpp +++ b/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight.cpp @@ -55,14 +55,14 @@ class TestGroupedConvndBwdWeight : public ::testing::Test } } - if(ck::is_navi3_supported()) + if(ck::is_gfx11_supported()) { - // on navi3x only support for 3d is implemented + // on gfx11 only support for 3d is implemented if constexpr(NDimSpatial{} != 3) { return true; } - // on navi3x only support for i8 and fp16 is implemented + // on gfx11 only support for i8 and fp16 is implemented if constexpr(!((std::is_same_v && std::is_same_v && std::is_same_v) || @@ -80,7 +80,7 @@ class TestGroupedConvndBwdWeight : public ::testing::Test } else { - // support for i8 is only implemented on navi3x + // support for i8 is only implemented on gfx11 if constexpr(std::is_same_v && std::is_same_v && std::is_same_v) {