From a03f2b367a3c1c88f1ce1931c05f4a4bf570113b Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Thu, 6 Mar 2025 21:45:31 -0800 Subject: [PATCH] RE-enable DL and DPP instances by default. (#1954) * enable DL and DPP instances by default * fix cmake logic [ROCm/composable_kernel commit: 43c90b523490d53798484c769c8437988c3a3b47] --- CHANGELOG.md | 1 + CMakeLists.txt | 6 ++++-- Jenkinsfile | 20 ++++++++++---------- README.md | 6 +++--- 4 files changed, 18 insertions(+), 15 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5d75fa64f5..cc98d35b16 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,6 +20,7 @@ None * Removed support for gfx940 and gfx941 targets (#1944) * Replaced the raw buffer load/store intrinsics with Clang20 built-ins (#1876) +* DL and DPP kernels are now enabled by default. ### Known issues diff --git a/CMakeLists.txt b/CMakeLists.txt index 3be508382a..bb0c254e06 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -94,12 +94,14 @@ add_compile_options(-Wno-pass-failed) add_compile_options(-Wno-switch-default) add_compile_options(-Wno-unique-object-duplication) -if(DL_KERNELS) +if(NOT DISABLE_DL_KERNELS) add_definitions(-DDL_KERNELS) + set(DL_KERNELS "ON") set(CK_ENABLE_DL_KERNELS "ON") endif() -if(DPP_KERNELS) +if(NOT DISABLE_DPP_KERNELS) add_definitions(-DDPP_KERNELS) + set(DPP_KERNELS "ON") set(CK_ENABLE_DPP_KERNELS "ON") endif() option(CK_USE_CODEGEN "Enable codegen library" OFF) diff --git a/Jenkinsfile b/Jenkinsfile index a35b0e1892..51a406ac4d 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -199,8 +199,8 @@ def cmake_build(Map conf=[:]){ } else{ setup_args = ' -DBUILD_DEV=On' + setup_args } - if (params.DL_KERNELS){ - setup_args = setup_args + " -DDL_KERNELS=ON " + if (params.DISABLE_DL_KERNELS){ + setup_args = setup_args + " -DDISABLE_DL_KERNELS=ON " } if(build_type_debug){ @@ -717,10 +717,10 @@ def process_results(Map conf=[:]){ } //launch develop branch daily at 23:00 UT in FULL_QA mode and 
at 19:00 UT with latest staging compiler version -CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;ROCMVERSION=6.3;RUN_CK_TILE_FMHA_TESTS=true;RUN_CK_TILE_GEMM_TESTS=true +CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;DISABLE_DL_KERNELS=true;ROCMVERSION=6.3;RUN_CK_TILE_FMHA_TESTS=true;RUN_CK_TILE_GEMM_TESTS=true 0 21 * * * % ROCMVERSION=6.3;hipTensor_test=true;RUN_CODEGEN_TESTS=true - 0 19 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true - 0 17 * * * % BUILD_DOCKER=true;DL_KERNELS=true;COMPILER_VERSION=amd-mainline;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true + 0 19 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true + 0 17 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-mainline;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true 0 15 * * * % BUILD_INSTANCES_ONLY=true;RUN_PERFORMANCE_TESTS=false;USE_SCCACHE=false 0 13 * * * % BUILD_LEGACY_OS=true''' : "" @@ -762,7 +762,7 @@ pipeline { defaultValue: false, description: "Select whether to run small set of performance tests (default) or full QA") booleanParam( - name: "DL_KERNELS", + name: "DISABLE_DL_KERNELS", defaultValue: false, description: "Select whether to build DL kernels (default: OFF)") booleanParam( @@ -861,7 +861,7 @@ pipeline { | grep -v 'build/' \ | xargs -n 1 -P 1 -I{} -t sh -c \'clang-format-12 -style=file {} | diff - {}\' && \ /cppcheck/build/bin/cppcheck ../* -v -j \$(nproc) -I ../include -I ../profiler/include -I ../library/include \ - -D CK_ENABLE_FP64 -D CK_ENABLE_FP32 -D CK_ENABLE_FP16 -D CK_ENABLE_FP8 -D CK_ENABLE_BF16 -D CK_ENABLE_BF8 -D CK_ENABLE_INT8 -D DL_KERNELS \ + -D CK_ENABLE_FP64 -D CK_ENABLE_FP32 -D CK_ENABLE_FP16 -D CK_ENABLE_FP8 -D CK_ENABLE_BF16 -D 
CK_ENABLE_BF8 -D CK_ENABLE_INT8 \ -D __gfx908__ -D __gfx90a__ -D __gfx942__ -D __gfx1030__ -D __gfx1100__ -D __gfx1101__ -D __gfx1102__ \ -U __gfx803__ -U __gfx900__ -U __gfx906__ -U CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 \ --file-filter=*.cpp --force --enable=all --output-file=ck_cppcheck.log" @@ -1164,7 +1164,7 @@ pipeline { } agent{ label rocmnode("gfx1030") } environment{ - setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1030" -DDL_KERNELS=ON -DCMAKE_CXX_FLAGS=" -O3 " """ + setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1030" -DCMAKE_CXX_FLAGS=" -O3 " """ execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \ cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \ -DGPU_TARGETS="gfx1030" \ @@ -1184,7 +1184,7 @@ pipeline { } agent{ label rocmnode("gfx1101") } environment{ - setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1101" -DDL_KERNELS=ON -DCMAKE_CXX_FLAGS=" -O3 " """ + setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1101" -DCMAKE_CXX_FLAGS=" -O3 " """ execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \ cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \ -DGPU_TARGETS="gfx1101" \ @@ -1204,7 +1204,7 @@ pipeline { } agent{ label rocmnode("gfx1201") } environment{ - setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1201" -DDL_KERNELS=ON -DCMAKE_CXX_FLAGS=" -O3 " """ + setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1201" -DCMAKE_CXX_FLAGS=" -O3 " """ execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \ cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \ -DGPU_TARGETS="gfx1201" \ diff --git a/README.md b/README.md index b9a6564173..c316a0a322 100644 --- a/README.md +++ b/README.md @@ -158,12 +158,12 @@ Additional cmake flags can be used to significantly speed-up the build: instances of select data types 
only. The main default data types are fp32 and fp16; you can safely skip other data types. -* `DL_KERNELS` (default is OFF) must be set to ON in order to build instances, such as `gemm_dl` or +* `DISABLE_DL_KERNELS` (default is OFF) must be set to ON in order to skip building instances, such as `gemm_dl` or `batched_gemm_multi_d_dl`. These instances are useful on architectures like the NAVI2x, as most other platforms have faster instances, such as `xdl` or `wmma`, available. -* `DPP_KERNELS` (default is OFF) must be set to ON in order to build instances, such as `gemm_dpp`. - These instances are useful on architectures like the NAVI2x, as most other platforms have faster instances, such as `xdl` or `wmma`, available. +* `DISABLE_DPP_KERNELS` (default is OFF) must be set to ON in order to skip building instances, such as `gemm_dpp`. + These instances offer slightly better performance for fp16 gemms on NAVI2x, but on other architectures faster alternatives are available. * `CK_USE_FP8_ON_UNSUPPORTED_ARCH` (default is OFF) must be set to ON in order to build instances, such as `gemm_universal`, `gemm_universal_streamk` and `gemm_multiply_multiply` for fp8 data type for GPU targets which do not have native support for fp8 data type, such as gfx908 or gfx90a. These instances are useful on