From f83a1c84c364f50ba9bc1b02f5386850f2d84e1a Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Wed, 26 Jul 2023 09:06:45 -0700 Subject: [PATCH] Disable DL kernels by default. (#816) [ROCm/composable_kernel commit: 9195435c77b17d554fb80086686746fa20106f76] --- CHANGELOG.md | 5 + CMakeLists.txt | 8 ++ Jenkinsfile | 5 +- README.md | 18 ++- .../gpu/batched_gemm_multi_d.hpp | 2 + .../tensor_operation_instance/gpu/gemm.hpp | 125 +++++++++++------- .../gpu/CMakeLists.txt | 8 +- .../gpu/batched_gemm_multi_d/CMakeLists.txt | 45 ++++--- .../gpu/gemm/CMakeLists.txt | 46 ++++--- profiler/src/CMakeLists.txt | 8 +- test/batched_gemm_multi_d/CMakeLists.txt | 4 +- 11 files changed, 177 insertions(+), 97 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3898b5ce2d..9001c2f4e7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,6 +12,11 @@ Full documentation for Composable Kernel is not yet available. - Improve proformance of normalization kernel ### Added +- Added new cmake flag "DL_KERNELS" must be set to "ON" in order to build the gemm_dl and batched_gemm_multi_d_dl instances. +- Added new cmake flag "DTYPES" which could be set to any subset of "fp64;fp32;fp16;fp8;bf16;int8" to build instance of select data types. +- Added new cmake flag "INSTANCES_ONLY" which will only build CK library and instances without the tests, examples, or profiler. +- Added new feature: if GPU_TARGETS is not set on cmake command line, CK will be built for all targets supported by compiler. +- Added support on MI300A/MI300X. - Added support on NAVI3x. - Added user tutorial (#563). - Added more instances for irregular GEMM sizes (#560). diff --git a/CMakeLists.txt b/CMakeLists.txt index c0de76c580..514b98fde6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -30,6 +30,14 @@ else() add_definitions(-D__int8__ -D__fp8__ -D__fp16__ -D__fp32__ -D__fp64__ -D__bf16__) endif() +if(DL_KERNELS) + add_definitions(-DDL_KERNELS) +endif() + +if(INSTANCES_ONLY) + add_definitions(-DINSTANCES_ONLY) +endif() + enable_testing() set(ROCM_SYMLINK_LIBS OFF) diff --git a/Jenkinsfile b/Jenkinsfile index 764dc71220..6df9ea3889 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -509,8 +509,7 @@ def Build_CK(Map conf=[:]){ cmake_build(conf) dir("build"){ //run tests and examples - def nt = nthreads() - sh 'make -j${nt} check' + sh 'make -j check' if (navi_node == 0 ){ //we only need the ckProfiler to run the performance tests, so we pack and stash it //do not stash profiler on Navi nodes @@ -741,7 +740,7 @@ pipeline { } agent{ label rocmnode("navi21") } environment{ - setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1030" """ + setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx1030" -DDL_KERNELS=ON """ execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && cmake -D CMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" -DGPU_TARGETS="gfx1030" -D CMAKE_CXX_COMPILER="${build_compiler()}" .. && make -j """ } diff --git a/README.md b/README.md index fd6f7e37c2..c2b493db11 100644 --- a/README.md +++ b/README.md @@ -52,6 +52,8 @@ CK is released under the MIT license. [License File](/LICENSE) ```bash DOCKER_BUILDKIT=1 docker build -t ck:latest -f Dockerfile . ``` +Pre-built dockers are available from this public repo: +https://hub.docker.com/r/rocm/composable_kernel/tags ## Launch docker @@ -76,12 +78,26 @@ mkdir build && cd build cmake \ -D CMAKE_PREFIX_PATH=/opt/rocm \ -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc \ --D CMAKE_CXX_FLAGS="-O3" \ -D CMAKE_BUILD_TYPE=Release \ -D GPU_TARGETS="gfx908;gfx90a" \ .. ``` +If GPU_TARGETS is not set on the cmake command line, CK will be built for all targets supported by the +current compiler. + + +Additional cmake flags can be used to significantly speed-up the build: + +INSTANCES_ONLY (by default is OFF) must be set to ON in order to build only the instances and library +while skipping all tests, examples, and profiler. This is useful for libraries that use CK as a dependency. + +DTYPES (by default not set) can be set to any subset of "fp64;fp32;fp16;fp8;bf16;int8" to build instances +of select data types only. Currently, building of int8 instances is taking a lot of time (the compiler fix is in the works). + +DL_KERNELS (by default is OFF) must be set to ON in order to build the gemm_dl and batched_gemm_multi_d_dl +instances. Those instances are only needed for the NAVI2x platforms. + ### Build examples and tests ```bash diff --git a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_multi_d.hpp b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_multi_d.hpp index 3d50aae092..2d7aaf8729 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_multi_d.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_multi_d.hpp @@ -14,6 +14,7 @@ #include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#ifdef DL_KERNELS namespace ck { namespace tensor_operation { namespace device { @@ -335,3 +336,4 @@ struct DeviceOperationInstanceFactory>>& @@ -56,11 +57,11 @@ void add_device_gemm_dl_f16_f16_f16_mk_nk_mn_irregular_instances( std::vector>>& instances); - +#endif +#if defined(__fp32__) && defined(DL_KERNELS) void add_device_gemm_dl_f32_f32_f32_km_kn_mn_instances( std::vector>>& - instances); void add_device_gemm_dl_f32_f32_f32_km_nk_mn_instances( @@ -77,7 +78,8 @@ void add_device_gemm_dl_f32_f32_f32_mk_nk_mn_instances( std::vector>>& instances); -#ifdef __int8__ +#endif +#if defined(__int8__) && defined(DL_KERNELS) void add_device_gemm_dl_i8_i8_i8_km_kn_mn_instances( std::vector>>& @@ -117,7 +119,8 @@ void add_device_gemm_dl_i8_i8_i8_mk_nk_mn_irregular_instances( std::vector>>& instances); - +#endif +#ifdef __int8__ void add_device_gemm_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instances( std::vector>>& @@ -138,32 +141,12 @@ void add_device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instances( DeviceGemm>>& instances); #endif - +#ifdef __fp16__ void add_device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instances( std::vector>>& instances); -void add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_kn_mn_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_nk_mn_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_kn_mn_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instances( - std::vector>>& - instances); - void add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances( std::vector>>& @@ -184,26 +167,6 @@ void add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances( DeviceGemm>>& instances); -void add_device_gemm_xdl_c_shuffle_f32_f32_f32_km_kn_mn_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_c_shuffle_f32_f32_f32_km_nk_mn_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_c_shuffle_f32_f32_f32_mk_kn_mn_instances( - std::vector>>& - instances); - -void add_device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instances( - std::vector>>& - instances); - void add_device_gemm_xdl_f16_f16_f16_km_kn_mn_instances( std::vector>>& @@ -224,6 +187,49 @@ void add_device_gemm_xdl_f16_f16_f16_mk_nk_mn_instances( DeviceGemm>>& instances); +#endif +#ifdef __bf16__ +void add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_kn_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_nk_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_kn_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instances( + std::vector>>& + instances); +#endif +#ifdef __fp32__ +void add_device_gemm_xdl_c_shuffle_f32_f32_f32_km_kn_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_c_shuffle_f32_f32_f32_km_nk_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_c_shuffle_f32_f32_f32_mk_kn_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instances( + std::vector>>& + instances); + void add_device_gemm_xdl_f32_f32_f32_km_kn_mn_instances( std::vector>>& @@ -243,7 +249,8 @@ void add_device_gemm_xdl_f32_f32_f32_mk_nk_mn_instances( std::vector>>& instances); - +#endif +#ifdef __fp64__ void add_device_gemm_xdl_f64_f64_f64_km_kn_mn_instances( std::vector>>& instances); - +#endif template ) { add_device_gemm_xdl_f32_f32_f32_mk_kn_mn_instances(op_ptrs); +#ifdef DL_KERNELS add_device_gemm_dl_f32_f32_f32_mk_kn_mn_instances(op_ptrs); +#endif add_device_gemm_xdl_c_shuffle_f32_f32_f32_mk_kn_mn_instances(op_ptrs); } else if constexpr(is_same_v && is_same_v && is_same_v) { add_device_gemm_xdl_f32_f32_f32_mk_nk_mn_instances(op_ptrs); +#ifdef DL_KERNELS add_device_gemm_dl_f32_f32_f32_mk_nk_mn_instances(op_ptrs); +#endif add_device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instances(op_ptrs); } else if constexpr(is_same_v && is_same_v && is_same_v) { add_device_gemm_xdl_f32_f32_f32_km_kn_mn_instances(op_ptrs); +#ifdef DL_KERNELS add_device_gemm_dl_f32_f32_f32_km_kn_mn_instances(op_ptrs); +#endif add_device_gemm_xdl_c_shuffle_f32_f32_f32_km_kn_mn_instances(op_ptrs); } else if constexpr(is_same_v && is_same_v && is_same_v) { add_device_gemm_xdl_f32_f32_f32_km_nk_mn_instances(op_ptrs); +#ifdef DL_KERNELS add_device_gemm_dl_f32_f32_f32_km_nk_mn_instances(op_ptrs); +#endif add_device_gemm_xdl_c_shuffle_f32_f32_f32_km_nk_mn_instances(op_ptrs); } } @@ -335,16 +350,20 @@ struct DeviceOperationInstanceFactory< is_same_v) { add_device_gemm_xdl_f16_f16_f16_mk_kn_mn_instances(op_ptrs); +#ifdef DL_KERNELS add_device_gemm_dl_f16_f16_f16_mk_kn_mn_instances(op_ptrs); add_device_gemm_dl_f16_f16_f16_mk_kn_mn_irregular_instances(op_ptrs); +#endif add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances(op_ptrs); } else if constexpr(is_same_v && is_same_v && is_same_v) { add_device_gemm_xdl_f16_f16_f16_mk_nk_mn_instances(op_ptrs); +#ifdef DL_KERNELS add_device_gemm_dl_f16_f16_f16_mk_nk_mn_instances(op_ptrs); add_device_gemm_dl_f16_f16_f16_mk_nk_mn_irregular_instances(op_ptrs); +#endif add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances(op_ptrs); add_device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instances(op_ptrs); } @@ -352,16 +371,20 @@ struct DeviceOperationInstanceFactory< is_same_v) { add_device_gemm_xdl_f16_f16_f16_km_kn_mn_instances(op_ptrs); +#ifdef DL_KERNELS add_device_gemm_dl_f16_f16_f16_km_kn_mn_instances(op_ptrs); add_device_gemm_dl_f16_f16_f16_km_kn_mn_irregular_instances(op_ptrs); +#endif add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances(op_ptrs); } else if constexpr(is_same_v && is_same_v && is_same_v) { add_device_gemm_xdl_f16_f16_f16_km_nk_mn_instances(op_ptrs); +#ifdef DL_KERNELS add_device_gemm_dl_f16_f16_f16_km_nk_mn_instances(op_ptrs); add_device_gemm_dl_f16_f16_f16_km_nk_mn_irregular_instances(op_ptrs); +#endif add_device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances(op_ptrs); } } @@ -397,29 +420,37 @@ struct DeviceOperationInstanceFactory< is_same_v) { add_device_gemm_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instances(op_ptrs); +#ifdef DL_KERNELS add_device_gemm_dl_i8_i8_i8_mk_kn_mn_instances(op_ptrs); add_device_gemm_dl_i8_i8_i8_mk_kn_mn_irregular_instances(op_ptrs); +#endif } else if constexpr(is_same_v && is_same_v && is_same_v) { add_device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instances(op_ptrs); +#ifdef DL_KERNELS add_device_gemm_dl_i8_i8_i8_mk_nk_mn_instances(op_ptrs); add_device_gemm_dl_i8_i8_i8_mk_nk_mn_irregular_instances(op_ptrs); +#endif } else if constexpr(is_same_v && is_same_v && is_same_v) { add_device_gemm_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instances(op_ptrs); +#ifdef DL_KERNELS add_device_gemm_dl_i8_i8_i8_km_kn_mn_instances(op_ptrs); add_device_gemm_dl_i8_i8_i8_km_kn_mn_irregular_instances(op_ptrs); +#endif } else if constexpr(is_same_v && is_same_v && is_same_v) { add_device_gemm_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instances(op_ptrs); +#ifdef DL_KERNELS add_device_gemm_dl_i8_i8_i8_km_nk_mn_instances(op_ptrs); add_device_gemm_dl_i8_i8_i8_km_nk_mn_irregular_instances(op_ptrs); +#endif } } #endif diff --git a/library/src/tensor_operation_instance/gpu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/CMakeLists.txt index 597aeaed8b..1d54a141b7 100644 --- a/library/src/tensor_operation_instance/gpu/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/CMakeLists.txt @@ -39,11 +39,15 @@ IF(IS_DIRECTORY "${subdir_path}") #message("int8 instance found!") set(add_inst 1) endif() - if(NOT "${cmake_instance}" MATCHES "DTYPES") + if(NOT "${cmake_instance}" MATCHES "DTYPES" OR NOT DEFINED DTYPES) #message("instance should be built for all types!") set(add_inst 1) endif() - if(add_inst EQUAL 1 OR NOT DEFINED DTYPES) + if("${cmake_instance}" MATCHES "ONLY DL_KERNELS" AND NOT DEFINED DL_KERNELS) + message("Found only dl instances, but DL_KERNELS is not set. Skipping.") + set(add_inst 0) + endif() + if(add_inst EQUAL 1) get_filename_component(target_dir ${subdir_path} NAME) add_subdirectory(${target_dir}) list(APPEND CK_DEVICE_INSTANCES $) diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/CMakeLists.txt index 8f4e3b6948..444c93b118 100644 --- a/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/CMakeLists.txt @@ -1,22 +1,25 @@ -set(BATCHED_GEMM_MULTID_INSTANCES) -if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) - list(APPEND BATCHED_GEMM_MULTID_INSTANCES device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gkn_gmn_instance.cpp) - list(APPEND BATCHED_GEMM_MULTID_INSTANCES device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gnk_gmn_instance.cpp) - list(APPEND BATCHED_GEMM_MULTID_INSTANCES device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gkn_gmn_instance.cpp) - list(APPEND BATCHED_GEMM_MULTID_INSTANCES device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gnk_gmn_instance.cpp) - list(APPEND BATCHED_GEMM_MULTID_INSTANCES device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gkn_gmn_irregular_instance.cpp) - list(APPEND BATCHED_GEMM_MULTID_INSTANCES device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gnk_gmn_irregular_instance.cpp) - list(APPEND BATCHED_GEMM_MULTID_INSTANCES device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gkn_gmn_irregular_instance.cpp) - list(APPEND BATCHED_GEMM_MULTID_INSTANCES device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gnk_gmn_irregular_instance.cpp) +# ONLY DL_KERNELS +if(DL_KERNELS) + set(BATCHED_GEMM_MULTID_INSTANCES) + if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) + list(APPEND BATCHED_GEMM_MULTID_INSTANCES device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gkn_gmn_instance.cpp) + list(APPEND BATCHED_GEMM_MULTID_INSTANCES device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gnk_gmn_instance.cpp) + list(APPEND BATCHED_GEMM_MULTID_INSTANCES device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gkn_gmn_instance.cpp) + list(APPEND BATCHED_GEMM_MULTID_INSTANCES device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gnk_gmn_instance.cpp) + list(APPEND BATCHED_GEMM_MULTID_INSTANCES device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gkn_gmn_irregular_instance.cpp) + list(APPEND BATCHED_GEMM_MULTID_INSTANCES device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gnk_gmn_irregular_instance.cpp) + list(APPEND BATCHED_GEMM_MULTID_INSTANCES device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gkn_gmn_irregular_instance.cpp) + list(APPEND BATCHED_GEMM_MULTID_INSTANCES device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gnk_gmn_irregular_instance.cpp) + endif() + if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES) + list(APPEND BATCHED_GEMM_MULTID_INSTANCES device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_instance.cpp) + list(APPEND BATCHED_GEMM_MULTID_INSTANCES device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_instance.cpp) + list(APPEND BATCHED_GEMM_MULTID_INSTANCES device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gkn_gmn_instance.cpp) + list(APPEND BATCHED_GEMM_MULTID_INSTANCES device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gnk_gmn_instance.cpp) + list(APPEND BATCHED_GEMM_MULTID_INSTANCES device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_irregular_instance.cpp) + list(APPEND BATCHED_GEMM_MULTID_INSTANCES device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_irregular_instance.cpp) + list(APPEND BATCHED_GEMM_MULTID_INSTANCES device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gkn_gmn_irregular_instance.cpp) + list(APPEND BATCHED_GEMM_MULTID_INSTANCES device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gnk_gmn_irregular_instance.cpp) + endif() + add_instance_library(device_batched_gemm_multi_d_instance ${BATCHED_GEMM_MULTID_INSTANCES}) endif() -if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES) - list(APPEND BATCHED_GEMM_MULTID_INSTANCES device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_instance.cpp) - list(APPEND BATCHED_GEMM_MULTID_INSTANCES device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_instance.cpp) - list(APPEND BATCHED_GEMM_MULTID_INSTANCES device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gkn_gmn_instance.cpp) - list(APPEND BATCHED_GEMM_MULTID_INSTANCES device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gnk_gmn_instance.cpp) - list(APPEND BATCHED_GEMM_MULTID_INSTANCES device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_irregular_instance.cpp) - list(APPEND BATCHED_GEMM_MULTID_INSTANCES device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_irregular_instance.cpp) - list(APPEND BATCHED_GEMM_MULTID_INSTANCES device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gkn_gmn_irregular_instance.cpp) - list(APPEND BATCHED_GEMM_MULTID_INSTANCES device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gnk_gmn_irregular_instance.cpp) -endif() -add_instance_library(device_batched_gemm_multi_d_instance ${BATCHED_GEMM_MULTID_INSTANCES}) diff --git a/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt index 62cdae7cdf..b6f63f39d7 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt @@ -14,25 +14,29 @@ if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES) list(APPEND GEMM_INSTANCES device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instance.cpp) list(APPEND GEMM_INSTANCES device_gemm_xdl_c_shuffle_f32_f32_f32_km_kn_mn_instance.cpp) list(APPEND GEMM_INSTANCES device_gemm_xdl_c_shuffle_f32_f32_f32_km_nk_mn_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_dl_f32_f32_f32_mk_kn_mn_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_dl_f32_f32_f32_mk_nk_mn_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_dl_f32_f32_f32_km_kn_mn_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_dl_f32_f32_f32_km_nk_mn_instance.cpp) + if(DL_KERNELS) + list(APPEND GEMM_INSTANCES device_gemm_dl_f32_f32_f32_mk_kn_mn_instance.cpp) + list(APPEND GEMM_INSTANCES device_gemm_dl_f32_f32_f32_mk_nk_mn_instance.cpp) + list(APPEND GEMM_INSTANCES device_gemm_dl_f32_f32_f32_km_kn_mn_instance.cpp) + list(APPEND GEMM_INSTANCES device_gemm_dl_f32_f32_f32_km_nk_mn_instance.cpp) + endif() endif() if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) + if(DL_KERNELS) + list(APPEND GEMM_INSTANCES device_gemm_dl_f16_f16_f16_mk_kn_mn_instance.cpp) + list(APPEND GEMM_INSTANCES device_gemm_dl_f16_f16_f16_mk_kn_mn_irregular_instance.cpp) + list(APPEND GEMM_INSTANCES device_gemm_dl_f16_f16_f16_mk_nk_mn_instance.cpp) + list(APPEND GEMM_INSTANCES device_gemm_dl_f16_f16_f16_mk_nk_mn_irregular_instance.cpp) + list(APPEND GEMM_INSTANCES device_gemm_dl_f16_f16_f16_km_kn_mn_instance.cpp) + list(APPEND GEMM_INSTANCES device_gemm_dl_f16_f16_f16_km_kn_mn_irregular_instance.cpp) + list(APPEND GEMM_INSTANCES device_gemm_dl_f16_f16_f16_km_nk_mn_instance.cpp) + list(APPEND GEMM_INSTANCES device_gemm_dl_f16_f16_f16_km_nk_mn_irregular_instance.cpp) + endif() list(APPEND GEMM_INSTANCES device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp) list(APPEND GEMM_INSTANCES device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp) list(APPEND GEMM_INSTANCES device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp) list(APPEND GEMM_INSTANCES device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp) list(APPEND GEMM_INSTANCES device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_dl_f16_f16_f16_mk_kn_mn_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_dl_f16_f16_f16_mk_kn_mn_irregular_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_dl_f16_f16_f16_mk_nk_mn_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_dl_f16_f16_f16_mk_nk_mn_irregular_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_dl_f16_f16_f16_km_kn_mn_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_dl_f16_f16_f16_km_kn_mn_irregular_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_dl_f16_f16_f16_km_nk_mn_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_dl_f16_f16_f16_km_nk_mn_irregular_instance.cpp) list(APPEND GEMM_INSTANCES device_gemm_xdl_f16_f16_f16/km_kn_mn_add_instance.cpp) list(APPEND GEMM_INSTANCES device_gemm_xdl_f16_f16_f16/km_kn_mn_default_pipeline_v1_instance.cpp) list(APPEND GEMM_INSTANCES device_gemm_xdl_f16_f16_f16/km_kn_mn_default_pipeline_v2_instance.cpp) @@ -67,14 +71,16 @@ if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) list(APPEND GEMM_INSTANCES device_gemm_xdl_f16_f16_f16/mk_nk_mn_irregular_interwave_pipeline_v1_instance.cpp) endif() if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES) - list(APPEND GEMM_INSTANCES device_gemm_dl_i8_i8_i8_mk_kn_mn_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_dl_i8_i8_i8_mk_kn_mn_irregular_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_dl_i8_i8_i8_mk_nk_mn_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_dl_i8_i8_i8_mk_nk_mn_irregular_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_dl_i8_i8_i8_km_kn_mn_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_dl_i8_i8_i8_km_kn_mn_irregular_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_dl_i8_i8_i8_km_nk_mn_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_dl_i8_i8_i8_km_nk_mn_irregular_instance.cpp) + if(DL_KERNELS) + list(APPEND GEMM_INSTANCES device_gemm_dl_i8_i8_i8_mk_kn_mn_instance.cpp) + list(APPEND GEMM_INSTANCES device_gemm_dl_i8_i8_i8_mk_kn_mn_irregular_instance.cpp) + list(APPEND GEMM_INSTANCES device_gemm_dl_i8_i8_i8_mk_nk_mn_instance.cpp) + list(APPEND GEMM_INSTANCES device_gemm_dl_i8_i8_i8_mk_nk_mn_irregular_instance.cpp) + list(APPEND GEMM_INSTANCES device_gemm_dl_i8_i8_i8_km_kn_mn_instance.cpp) + list(APPEND GEMM_INSTANCES device_gemm_dl_i8_i8_i8_km_kn_mn_irregular_instance.cpp) + list(APPEND GEMM_INSTANCES device_gemm_dl_i8_i8_i8_km_nk_mn_instance.cpp) + list(APPEND GEMM_INSTANCES device_gemm_dl_i8_i8_i8_km_nk_mn_irregular_instance.cpp) + endif() list(APPEND GEMM_INSTANCES device_gemm_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp) list(APPEND GEMM_INSTANCES device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp) list(APPEND GEMM_INSTANCES device_gemm_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp) diff --git a/profiler/src/CMakeLists.txt b/profiler/src/CMakeLists.txt index d35c743939..d057dccb40 100644 --- a/profiler/src/CMakeLists.txt +++ b/profiler/src/CMakeLists.txt @@ -34,9 +34,11 @@ set(PROFILER_SOURCES profile_grouped_gemm_fastgelu.cpp profile_contraction_bilinear.cpp profile_contraction_scale.cpp - profile_batched_gemm_multi_d.cpp profile_grouped_conv_bwd_data.cpp ) +if(DL_KERNELS) + list(APPEND PROFILER_SOURCES profile_batched_gemm_multi_d.cpp) +endif() set(PROFILER_EXECUTABLE ckProfiler) @@ -79,7 +81,9 @@ target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_fastgel target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_bilinear_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_scale_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_pool_fwd_instance) -target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_multi_d_instance) +if(DL_KERNELS) + target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_multi_d_instance) +endif() target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_data_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_data_instance) rocm_install(TARGETS ${PROFILER_EXECUTABLE} COMPONENT profiler) diff --git a/test/batched_gemm_multi_d/CMakeLists.txt b/test/batched_gemm_multi_d/CMakeLists.txt index 45a306551f..f8ceb545c2 100644 --- a/test/batched_gemm_multi_d/CMakeLists.txt +++ b/test/batched_gemm_multi_d/CMakeLists.txt @@ -1,5 +1,7 @@ # TODO: Enable for gfx90a after complier fix -if(NOT GPU_TARGETS MATCHES "gfx90a") +if(DL_KERNELS) + if(NOT GPU_TARGETS MATCHES "gfx90a") add_gtest_executable(test_batched_gemm_multi_d test_batched_gemm_multi_d.cpp) target_link_libraries(test_batched_gemm_multi_d PRIVATE utility device_batched_gemm_multi_d_instance) + endif() endif()