diff --git a/CMakeLists.txt b/CMakeLists.txt index 6d4176735f..191aad8721 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -97,6 +97,10 @@ if(DL_KERNELS) add_definitions(-DDL_KERNELS) set(CK_ENABLE_DL_KERNELS "ON") endif() +if(DPP_KERNELS) + add_definitions(-DDPP_KERNELS) + set(CK_ENABLE_DPP_KERNELS "ON") +endif() option(CK_USE_CODEGEN "Enable codegen library" OFF) if(CK_USE_CODEGEN) add_definitions(-DCK_USE_CODEGEN) diff --git a/README.md b/README.md index c0872aa567..719c008c2b 100644 --- a/README.md +++ b/README.md @@ -153,6 +153,9 @@ Additional cmake flags can be used to significantly speed-up the build: `batched_gemm_multi_d_dl`. These instances are useful on architectures like the NAVI2x, as most other platforms have faster instances, such as `xdl` or `wmma`, available. +* `DPP_KERNELS` (default is OFF) must be set to ON in order to build instances, such as `gemm_dpp`. + These instances are useful on architectures like the NAVI2x, as most other platforms have faster instances, such as `xdl` or `wmma`, available. + * `CK_USE_FP8_ON_UNSUPPORTED_ARCH` (default is OFF) must be set to ON in order to build instances, such as `gemm_universal`, `gemm_universal_streamk` and `gemm_multiply_multiply` for fp8 data type for GPU targets which do not have native support for fp8 data type, such as gfx908 or gfx90a. These instances are useful on architectures like the MI100/MI200 for the functional support only. diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt index 72759916af..f5ae4145e7 100644 --- a/example/CMakeLists.txt +++ b/example/CMakeLists.txt @@ -54,9 +54,9 @@ function(add_example_executable EXAMPLE_NAME FILE_NAME) list(REMOVE_ITEM FILE_NAME "${source}") endif() endforeach() - #Do not build any DPP examples if DL_KERNELS not set + #Do not build any DPP examples if DPP_KERNELS not set foreach(source IN LISTS FILE_NAME) - if(NOT DEFINED DL_KERNELS AND source MATCHES "_dpp") + if(NOT DEFINED DPP_KERNELS AND source MATCHES "_dpp") message("removing dpp example ${source} ") list(REMOVE_ITEM FILE_NAME "${source}") endif() diff --git a/include/ck/config.h.in b/include/ck/config.h.in index 2c37300e9b..3a590c676f 100644 --- a/include/ck/config.h.in +++ b/include/ck/config.h.in @@ -97,6 +97,10 @@ #cmakedefine CK_ENABLE_DL_KERNELS @CK_ENABLE_DL_KERNELS@ #endif +#ifndef CK_ENABLE_DPP_KERNELS +#cmakedefine CK_ENABLE_DPP_KERNELS @CK_ENABLE_DPP_KERNELS@ +#endif + // // CK kernels which support XDL (MI series) // diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp index 3b3baf6978..2dc2061015 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp @@ -15,6 +15,9 @@ #ifdef DL_KERNELS #include "gemm_dl.inc" #endif +#ifdef DPP_KERNELS +#include "gemm_dpp.inc" +#endif #ifdef CK_USE_WMMA #include "gemm_wmma.inc" #endif @@ -92,32 +95,24 @@ struct DeviceOperationInstanceFactory< { add_device_gemm_dl_f16_f16_f16_mk_kn_mn_instances(op_ptrs); add_device_gemm_dl_f16_f16_f16_mk_kn_mn_irregular_instances(op_ptrs); - add_device_gemm_dpp_f16_f16_f16_mk_kn_mn_instances(op_ptrs); - add_device_gemm_dpp_f16_f16_f16_mk_kn_mn_irregular_instances(op_ptrs); } else if constexpr(is_same_v && is_same_v && is_same_v) { add_device_gemm_dl_f16_f16_f16_mk_nk_mn_instances(op_ptrs); add_device_gemm_dl_f16_f16_f16_mk_nk_mn_irregular_instances(op_ptrs); - add_device_gemm_dpp_f16_f16_f16_mk_nk_mn_instances(op_ptrs); - add_device_gemm_dpp_f16_f16_f16_mk_nk_mn_irregular_instances(op_ptrs); } else if constexpr(is_same_v && is_same_v && is_same_v) { add_device_gemm_dl_f16_f16_f16_km_kn_mn_instances(op_ptrs); add_device_gemm_dl_f16_f16_f16_km_kn_mn_irregular_instances(op_ptrs); - add_device_gemm_dpp_f16_f16_f16_km_kn_mn_instances(op_ptrs); - add_device_gemm_dpp_f16_f16_f16_km_kn_mn_irregular_instances(op_ptrs); } else if constexpr(is_same_v && is_same_v && is_same_v) { add_device_gemm_dl_f16_f16_f16_km_nk_mn_instances(op_ptrs); add_device_gemm_dl_f16_f16_f16_km_nk_mn_irregular_instances(op_ptrs); - add_device_gemm_dpp_f16_f16_f16_km_nk_mn_instances(op_ptrs); - add_device_gemm_dpp_f16_f16_f16_km_nk_mn_irregular_instances(op_ptrs); } } #endif @@ -153,6 +148,39 @@ struct DeviceOperationInstanceFactory< #endif #endif // DL_KERNELS +#ifdef DPP_KERNELS +#ifdef CK_ENABLE_FP16 + if constexpr(is_same_v && is_same_v && + is_same_v) + { + if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_dpp_f16_f16_f16_mk_kn_mn_instances(op_ptrs); + add_device_gemm_dpp_f16_f16_f16_mk_kn_mn_irregular_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_dpp_f16_f16_f16_mk_nk_mn_instances(op_ptrs); + add_device_gemm_dpp_f16_f16_f16_mk_nk_mn_irregular_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_dpp_f16_f16_f16_km_kn_mn_instances(op_ptrs); + add_device_gemm_dpp_f16_f16_f16_km_kn_mn_irregular_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + add_device_gemm_dpp_f16_f16_f16_km_nk_mn_instances(op_ptrs); + add_device_gemm_dpp_f16_f16_f16_km_nk_mn_irregular_instances(op_ptrs); + } + } +#endif +#endif // DPP_KERNELS + #ifdef CK_USE_WMMA #ifdef CK_ENABLE_FP16 if constexpr(is_same_v && is_same_v && diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_dl.inc b/library/include/ck/library/tensor_operation_instance/gpu/gemm_dl.inc index 44a11f6284..0fee4190a6 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_dl.inc +++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_dl.inc @@ -28,16 +28,6 @@ void add_device_gemm_dl_f16_f16_f16_km_kn_mn_irregular_instances( DeviceGemm>>& instances); -void add_device_gemm_dpp_f16_f16_f16_km_kn_mn_instances( - std::vector>>& - instances); - -void add_device_gemm_dpp_f16_f16_f16_km_kn_mn_irregular_instances( - std::vector>>& - instances); - void add_device_gemm_dl_f16_f16_f16_km_nk_mn_instances( std::vector>>& @@ -48,16 +38,6 @@ void add_device_gemm_dl_f16_f16_f16_km_nk_mn_irregular_instances( DeviceGemm>>& instances); -void add_device_gemm_dpp_f16_f16_f16_km_nk_mn_instances( - std::vector>>& - instances); - -void add_device_gemm_dpp_f16_f16_f16_km_nk_mn_irregular_instances( - std::vector>>& - instances); - void add_device_gemm_dl_f16_f16_f16_mk_kn_mn_instances( std::vector>>& @@ -68,16 +48,6 @@ void add_device_gemm_dl_f16_f16_f16_mk_kn_mn_irregular_instances( DeviceGemm>>& instances); -void add_device_gemm_dpp_f16_f16_f16_mk_kn_mn_instances( - std::vector>>& - instances); - -void add_device_gemm_dpp_f16_f16_f16_mk_kn_mn_irregular_instances( - std::vector>>& - instances); - void add_device_gemm_dl_f16_f16_f16_mk_nk_mn_instances( std::vector>>& diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_dpp.inc b/library/include/ck/library/tensor_operation_instance/gpu/gemm_dpp.inc new file mode 100644 index 0000000000..b43552673d --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_dpp.inc @@ -0,0 +1,65 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/device_gemm.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +#if defined(CK_ENABLE_FP16) +void add_device_gemm_dpp_f16_f16_f16_km_kn_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_dpp_f16_f16_f16_km_kn_mn_irregular_instances( + std::vector>>& + instances); + +void add_device_gemm_dpp_f16_f16_f16_km_nk_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_dpp_f16_f16_f16_km_nk_mn_irregular_instances( + std::vector>>& + instances); + +void add_device_gemm_dpp_f16_f16_f16_mk_kn_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_dpp_f16_f16_f16_mk_kn_mn_irregular_instances( + std::vector>>& + instances); + +void add_device_gemm_dpp_f16_f16_f16_mk_nk_mn_instances( + std::vector>>& + instances); + +void add_device_gemm_dpp_f16_f16_f16_mk_nk_mn_irregular_instances( + std::vector>>& + instances); +#endif + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/CMakeLists.txt index d72281f437..90437478c1 100755 --- a/library/src/tensor_operation_instance/gpu/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/CMakeLists.txt @@ -39,6 +39,13 @@ function(add_instance_library INSTANCE_NAME) set(INST_TARGETS ${SUPPORTED_GPU_TARGETS}) + # Do not build DPP instances if DPP_KERNELS macro is not set + foreach(source IN LISTS ARGN) + if(NOT DEFINED DPP_KERNELS AND source MATCHES "_dpp") + message("removing dpp instance ${source} ") + list(REMOVE_ITEM ARGN "${source}") + endif() + endforeach() # Do not build DL instances if DL_KERNELS macro is not set foreach(source IN LISTS ARGN) if(NOT DEFINED DL_KERNELS AND source MATCHES "_dl") diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 498a20dc55..c499482bd8 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -43,6 +43,12 @@ function(add_test_executable TEST_NAME) set(TEST_TARGETS ${SUPPORTED_GPU_TARGETS}) + foreach(source IN LISTS ARGN) + if(NOT DEFINED DPP_KERNELS AND source MATCHES "_dpp") + message("removing dpp test ${source} ") + list(REMOVE_ITEM ARGN "${source}") + endif() + endforeach() foreach(source IN LISTS ARGN) if(NOT DEFINED DL_KERNELS AND source MATCHES "_dl") message("removing dl test ${source} ")