diff --git a/CMakeLists.txt b/CMakeLists.txt index 32defea756..95ef2df7e9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,5 @@ cmake_minimum_required(VERSION 3.14) +cmake_policy(SET CMP0140 NEW) # This has to be initialized before the project() command appears # Set the default of CMAKE_BUILD_TYPE to be release, unless user specifies with -D. MSVC_IDE does not use CMAKE_BUILD_TYPE @@ -383,31 +384,31 @@ IF(IS_DIRECTORY "${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu set(cmake_instance) file(READ "${PROJECT_SOURCE_DIR}/library/src/tensor_operation_instance/gpu/${subdir_path}/CMakeLists.txt" cmake_instance) set(add_inst 0) - if("${cmake_instance}" MATCHES "DTYPES MATCHES \"fp8\" " AND DTYPES MATCHES "fp8") + if(("${cmake_instance}" MATCHES "fp8" OR "${cmake_instance}" MATCHES "_f8") AND DTYPES MATCHES "fp8") #message("fp8 instance found!") set(add_inst 1) endif() - if("${cmake_instance}" MATCHES "DTYPES MATCHES \"bf8\" " AND DTYPES MATCHES "bf8") + if(("${cmake_instance}" MATCHES "bf8" OR "${cmake_instance}" MATCHES "_b8") AND DTYPES MATCHES "bf8") #message("bf8 instance found!") set(add_inst 1) endif() - if("${cmake_instance}" MATCHES "DTYPES MATCHES \"fp16\"" AND DTYPES MATCHES "fp16") + if(("${cmake_instance}" MATCHES "fp16" OR "${cmake_instance}" MATCHES "_f16") AND DTYPES MATCHES "fp16") #message("fp16 instance found!") set(add_inst 1) endif() - if("${cmake_instance}" MATCHES "DTYPES MATCHES \"fp32\"" AND DTYPES MATCHES "fp32") + if(("${cmake_instance}" MATCHES "fp32" OR "${cmake_instance}" MATCHES "_f32") AND DTYPES MATCHES "fp32") #message("fp32 instance found!") set(add_inst 1) endif() - if("${cmake_instance}" MATCHES "DTYPES MATCHES \"fp64\"" AND DTYPES MATCHES "fp64") + if(("${cmake_instance}" MATCHES "fp64" OR "${cmake_instance}" MATCHES "_f64") AND DTYPES MATCHES "fp64") #message("fp64 instance found!") set(add_inst 1) endif() - if("${cmake_instance}" MATCHES "DTYPES MATCHES \"bf16\"" AND DTYPES MATCHES "bf16") + 
if(("${cmake_instance}" MATCHES "bf16" OR "${cmake_instance}" MATCHES "_b16") AND DTYPES MATCHES "bf16") #message("bf16 instance found!") set(add_inst 1) endif() - if("${cmake_instance}" MATCHES "DTYPES MATCHES \"int8\"" AND DTYPES MATCHES "int8") + if(("${cmake_instance}" MATCHES "int8" OR "${cmake_instance}" MATCHES "_i8") AND DTYPES MATCHES "int8") #message("int8 instance found!") set(add_inst 1) endif() diff --git a/example/01_gemm/CMakeLists.txt b/example/01_gemm/CMakeLists.txt index 5574d09001..2351d3f105 100644 --- a/example/01_gemm/CMakeLists.txt +++ b/example/01_gemm/CMakeLists.txt @@ -1,51 +1,54 @@ -if(DL_KERNELS) - add_custom_target(example_gemm_dl) +add_custom_target(example_gemm_dl) - add_example_executable(example_gemm_dl_fp32 gemm_dl_fp32.cpp) - add_dependencies(example_gemm_dl example_gemm_dl_fp32) - if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) - add_example_executable(example_gemm_dl_fp16 gemm_dl_fp16.cpp) +add_example_executable(example_gemm_dl_fp32 gemm_dl_fp32.cpp) +if(result EQUAL 0) + add_dependencies(example_gemm_dl example_gemm_dl_fp32) +endif() +add_example_executable(example_gemm_dl_fp16 gemm_dl_fp16.cpp) +if(result EQUAL 0) add_dependencies(example_gemm_dl example_gemm_dl_fp16) - add_example_executable(example_gemm_dpp_fp16 gemm_dpp_fp16.cpp) - endif() - if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES) - add_example_executable(example_gemm_dl_int8 gemm_dl_int8.cpp) +endif() +add_example_executable(example_gemm_dpp_fp16 gemm_dpp_fp16.cpp) +add_example_executable(example_gemm_dl_int8 gemm_dl_int8.cpp) +if(result EQUAL 0) add_dependencies(example_gemm_dl example_gemm_dl_int8) - endif() - - if(USE_BITINT_EXTENSION_INT4) +endif() +if(USE_BITINT_EXTENSION_INT4) add_example_executable(example_gemm_dl_int4 gemm_dl_int4.cpp) add_dependencies(example_gemm_dl example_gemm_dl_int4) - endif(USE_BITINT_EXTENSION_INT4) -endif() +endif(USE_BITINT_EXTENSION_INT4) add_custom_target(example_gemm_xdl) -if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) - 
add_example_executable(example_gemm_xdl_fp16 gemm_xdl_fp16.cpp) - add_example_executable(example_gemm_xdl_wavelet_fp16 gemm_xdl_wavelet_fp16.cpp) - add_dependencies(example_gemm_xdl example_gemm_xdl_fp16) - add_dependencies(example_gemm_xdl example_gemm_xdl_wavelet_fp16) - add_example_executable(example_gemm_xdl_skip_b_lds_fp16 gemm_xdl_skip_b_lds_fp16.cpp) - add_dependencies(example_gemm_xdl example_gemm_xdl_skip_b_lds_fp16) - - if(GPU_TARGETS MATCHES "gfx1100" OR GPU_TARGETS MATCHES "gfx1101" OR GPU_TARGETS MATCHES "gfx1102") +add_example_executable(example_gemm_xdl_fp16 gemm_xdl_fp16.cpp) +if(result EQUAL 0) + add_dependencies(example_gemm_xdl example_gemm_xdl_fp16) +endif() +add_example_executable(example_gemm_xdl_wavelet_fp16 gemm_xdl_wavelet_fp16.cpp) +if(result EQUAL 0) + add_dependencies(example_gemm_xdl example_gemm_xdl_wavelet_fp16) +endif() +add_example_executable(example_gemm_xdl_skip_b_lds_fp16 gemm_xdl_skip_b_lds_fp16.cpp) +if(result EQUAL 0) + add_dependencies(example_gemm_xdl example_gemm_xdl_skip_b_lds_fp16) +endif() +if(GPU_TARGETS MATCHES "gfx1100" OR GPU_TARGETS MATCHES "gfx1101" OR GPU_TARGETS MATCHES "gfx1102") add_custom_target(example_gemm_wmma) add_example_executable(example_gemm_wmma_fp16 gemm_wmma_fp16.cpp) - add_dependencies(example_gemm_wmma example_gemm_wmma_fp16) - endif() - + if(result EQUAL 0) + add_dependencies(example_gemm_wmma example_gemm_wmma_fp16) + endif() endif() -if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES) - add_example_executable(example_gemm_xdl_bf16 gemm_xdl_bf16.cpp) +add_example_executable(example_gemm_xdl_bf16 gemm_xdl_bf16.cpp) +if(result EQUAL 0) add_dependencies(example_gemm_xdl example_gemm_xdl_bf16) add_example_executable(example_gemm_xdl_bf16_rtn gemm_xdl_bf16_rtn.cpp) add_dependencies(example_gemm_xdl example_gemm_xdl_bf16_rtn) endif() -if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES) - add_example_executable(example_gemm_xdl_int8 gemm_xdl_int8.cpp) +add_example_executable(example_gemm_xdl_int8 
gemm_xdl_int8.cpp) +if(result EQUAL 0) add_dependencies(example_gemm_xdl example_gemm_xdl_int8) endif() @@ -54,22 +57,23 @@ if(USE_BITINT_EXTENSION_INT4) add_dependencies(example_gemm_xdl example_gemm_xdl_int4) endif(USE_BITINT_EXTENSION_INT4) -if(DTYPES MATCHES "fp64" OR NOT DEFINED DTYPES) - # FIXME: re-enable this exampe as test when SWDEV-335738 is fixed - add_example_executable_no_testing(example_gemm_xdl_fp64 gemm_xdl_fp64.cpp) +# FIXME: re-enable this exampe as test when SWDEV-335738 is fixed +add_example_executable_no_testing(example_gemm_xdl_fp64 gemm_xdl_fp64.cpp) +if(result EQUAL 0) add_dependencies(example_gemm_xdl example_gemm_xdl_fp64) endif() add_example_executable(example_gemm_xdl_streamk gemm_xdl_streamk.cpp) -if(DTYPES MATCHES "fp8" OR NOT DEFINED DTYPES) - if(GPU_TARGETS MATCHES "gfx940" OR GPU_TARGETS MATCHES "gfx941" OR GPU_TARGETS MATCHES "gfx942") - add_example_executable(example_gemm_xdl_f8 gemm_xdl_f8.cpp) + +if(GPU_TARGETS MATCHES "gfx940" OR GPU_TARGETS MATCHES "gfx941" OR GPU_TARGETS MATCHES "gfx942") + add_example_executable(example_gemm_xdl_f8 gemm_xdl_f8.cpp) + if(result EQUAL 0) add_dependencies(example_gemm_xdl example_gemm_xdl_f8) endif() endif() -if((DTYPES MATCHES "fp8" AND DTYPES MATCHES "fp16") OR NOT DEFINED DTYPES) - add_example_executable(example_gemm_xdl_fp16_f8 gemm_xdl_fp16_f8.cpp) +add_example_executable(example_gemm_xdl_fp16_f8 gemm_xdl_fp16_f8.cpp) +if(result EQUAL 0) add_dependencies(example_gemm_xdl example_gemm_xdl_fp16_f8) endif() diff --git a/example/02_gemm_bilinear/CMakeLists.txt b/example/02_gemm_bilinear/CMakeLists.txt index 52e6330523..d82c42d5a9 100644 --- a/example/02_gemm_bilinear/CMakeLists.txt +++ b/example/02_gemm_bilinear/CMakeLists.txt @@ -1,4 +1,3 @@ -if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) list(APPEND gpu_list1 gfx1100 gfx1101 gfx1102) list(APPEND gpu_list2 gfx908 gfx90a gfx940 gfx941 gfx942) set(target 0) @@ -19,4 +18,3 @@ foreach(gpu IN LISTS GPU_TARGETS) set(target 1) endif() endforeach() 
-endif() diff --git a/example/03_gemm_bias_relu/CMakeLists.txt b/example/03_gemm_bias_relu/CMakeLists.txt index a247a052cb..2f5cba924d 100644 --- a/example/03_gemm_bias_relu/CMakeLists.txt +++ b/example/03_gemm_bias_relu/CMakeLists.txt @@ -1,4 +1,3 @@ -if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942) set(target 0) foreach(gpu IN LISTS GPU_TARGETS) @@ -7,4 +6,3 @@ foreach(gpu IN LISTS GPU_TARGETS) set(target 1) endif() endforeach() -endif() diff --git a/example/04_gemm_add_add_fastgelu/CMakeLists.txt b/example/04_gemm_add_add_fastgelu/CMakeLists.txt index 15ec62c89f..9fe833dda0 100644 --- a/example/04_gemm_add_add_fastgelu/CMakeLists.txt +++ b/example/04_gemm_add_add_fastgelu/CMakeLists.txt @@ -3,24 +3,24 @@ set(target 0) foreach(gpu IN LISTS GPU_TARGETS) if(gpu IN_LIST gpu_list AND target EQUAL 0) add_custom_target(example_gemm_add_add_fastgelu_xdl) - if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES) - add_example_executable(example_gemm_add_add_fastgelu_xdl_bf16 gemm_add_add_fastgelu_xdl_bf16.cpp) + add_example_executable(example_gemm_add_add_fastgelu_xdl_bf16 gemm_add_add_fastgelu_xdl_bf16.cpp) + if(result EQUAL 0) add_dependencies(example_gemm_add_add_fastgelu_xdl example_gemm_add_add_fastgelu_xdl_bf16) endif() - if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) - add_example_executable(example_gemm_add_add_fastgelu_xdl_fp16 gemm_add_add_fastgelu_xdl_fp16.cpp) + add_example_executable(example_gemm_add_add_fastgelu_xdl_fp16 gemm_add_add_fastgelu_xdl_fp16.cpp) + if(result EQUAL 0) add_dependencies(example_gemm_add_add_fastgelu_xdl example_gemm_add_add_fastgelu_xdl_fp16) endif() - if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES) - add_example_executable(example_gemm_add_add_fastgelu_xdl_fp32 gemm_add_add_fastgelu_xdl_fp32.cpp) + add_example_executable(example_gemm_add_add_fastgelu_xdl_fp32 gemm_add_add_fastgelu_xdl_fp32.cpp) + if(result EQUAL 0) add_dependencies(example_gemm_add_add_fastgelu_xdl 
example_gemm_add_add_fastgelu_xdl_fp32) endif() if(USE_BITINT_EXTENSION_INT4) add_example_executable(example_gemm_add_add_fastgelu_xdl_int4 gemm_add_add_fastgelu_xdl_int4.cpp) add_dependencies(example_gemm_add_add_fastgelu_xdl example_gemm_add_add_fastgelu_xdl_int4) endif(USE_BITINT_EXTENSION_INT4) - if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES) - add_example_executable(example_gemm_add_add_fastgelu_xdl_int8 gemm_add_add_fastgelu_xdl_int8.cpp) + add_example_executable(example_gemm_add_add_fastgelu_xdl_int8 gemm_add_add_fastgelu_xdl_int8.cpp) + if(result EQUAL 0) add_dependencies(example_gemm_add_add_fastgelu_xdl example_gemm_add_add_fastgelu_xdl_int8) endif() set(target 1) diff --git a/example/09_convnd_fwd/CMakeLists.txt b/example/09_convnd_fwd/CMakeLists.txt index 1af1e6c858..f9903bfe03 100644 --- a/example/09_convnd_fwd/CMakeLists.txt +++ b/example/09_convnd_fwd/CMakeLists.txt @@ -2,34 +2,16 @@ list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942) set(target 0) foreach(gpu IN LISTS GPU_TARGETS) if(gpu IN_LIST gpu_list AND target EQUAL 0) - if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES) add_example_executable(example_convnd_fwd_xdl_fp32 convnd_fwd_xdl_fp32.cpp) - endif() - if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) add_example_executable(example_convnd_fwd_xdl_fp16 convnd_fwd_xdl_fp16.cpp) - endif() - if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES) add_example_executable(example_convnd_fwd_xdl_bf16 convnd_fwd_xdl_bf16.cpp) - endif() - if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES) add_example_executable(example_convnd_fwd_xdl_int8 convnd_fwd_xdl_int8.cpp) - endif() - # FIXME: re-enable this exampe as test when SWDEV-335738 is fixed - if(DTYPES MATCHES "fp64" OR NOT DEFINED DTYPES) + # FIXME: re-enable this exampe as test when SWDEV-335738 is fixed add_example_executable_no_testing(example_convnd_fwd_xdl_fp64 convnd_fwd_xdl_fp64.cpp) - endif() set(target 1) endif() endforeach() -if(DL_KERNELS) - if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) - 
add_example_executable(example_convnd_fwd_dl_fp16 convnd_fwd_dl_fp16.cpp) - endif() - if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES) - add_example_executable(example_convnd_fwd_dl_fp32 convnd_fwd_dl_fp32.cpp) - endif() - if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES) - add_example_executable(example_convnd_fwd_dl_int8 convnd_fwd_dl_int8.cpp) - endif() -endif() +add_example_executable(example_convnd_fwd_dl_fp16 convnd_fwd_dl_fp16.cpp) +add_example_executable(example_convnd_fwd_dl_fp32 convnd_fwd_dl_fp32.cpp) +add_example_executable(example_convnd_fwd_dl_int8 convnd_fwd_dl_int8.cpp) diff --git a/example/10_convnd_fwd_multiple_d_multiple_reduce/CMakeLists.txt b/example/10_convnd_fwd_multiple_d_multiple_reduce/CMakeLists.txt index e7d941ae6b..150d146e31 100644 --- a/example/10_convnd_fwd_multiple_d_multiple_reduce/CMakeLists.txt +++ b/example/10_convnd_fwd_multiple_d_multiple_reduce/CMakeLists.txt @@ -2,27 +2,27 @@ list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942) set(target 0) foreach(gpu IN LISTS GPU_TARGETS) if(gpu IN_LIST gpu_list AND target EQUAL 0) - add_custom_target(example_convnd_fwd_reduce_xdl) - if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES) - add_example_executable(example_convnd_fwd_max_xdl_int8 convnd_fwd_max_xdl_int8.cpp) + add_custom_target(example_convnd_fwd_reduce_xdl) + add_example_executable(example_convnd_fwd_max_xdl_int8 convnd_fwd_max_xdl_int8.cpp) + if(result EQUAL 0) add_dependencies(example_convnd_fwd_reduce_xdl example_convnd_fwd_max_xdl_int8) - endif() - if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES) - add_example_executable_no_testing(example_convnd_fwd_max_xdl_bf16 convnd_fwd_max_xdl_bf16.cpp) + endif() + add_example_executable_no_testing(example_convnd_fwd_max_xdl_bf16 convnd_fwd_max_xdl_bf16.cpp) + if(result EQUAL 0) add_dependencies(example_convnd_fwd_reduce_xdl example_convnd_fwd_max_xdl_bf16) - endif() - if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) - add_example_executable_no_testing(example_convnd_fwd_max_xdl_fp16 
convnd_fwd_max_xdl_fp16.cpp) + endif() + add_example_executable_no_testing(example_convnd_fwd_max_xdl_fp16 convnd_fwd_max_xdl_fp16.cpp) + if(result EQUAL 0) add_dependencies(example_convnd_fwd_reduce_xdl example_convnd_fwd_max_xdl_fp16) - endif() - if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES) - add_example_executable(example_convnd_fwd_max_xdl_fp32 convnd_fwd_max_xdl_fp32.cpp) + endif() + add_example_executable(example_convnd_fwd_max_xdl_fp32 convnd_fwd_max_xdl_fp32.cpp) + if(result EQUAL 0) add_dependencies(example_convnd_fwd_reduce_xdl example_convnd_fwd_max_xdl_fp32) - endif() - if(USE_BITINT_EXTENSION_INT4) + endif() + if(USE_BITINT_EXTENSION_INT4) add_example_executable(example_convnd_fwd_max_xdl_int4 convnd_fwd_max_xdl_int4.cpp) add_dependencies(example_convnd_fwd_reduce_xdl example_convnd_fwd_max_xdl_int4) - endif(USE_BITINT_EXTENSION_INT4) - set(target 1) + endif(USE_BITINT_EXTENSION_INT4) + set(target 1) endif() endforeach() \ No newline at end of file diff --git a/example/13_pool2d_fwd/CMakeLists.txt b/example/13_pool2d_fwd/CMakeLists.txt index d0f356757b..e2a923cded 100644 --- a/example/13_pool2d_fwd/CMakeLists.txt +++ b/example/13_pool2d_fwd/CMakeLists.txt @@ -1,6 +1,2 @@ -if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) - add_example_executable(example_pool2d_fwd_fp16 pool2d_fwd_fp16.cpp) -endif() -if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES) - add_example_executable(example_pool2d_fwd_fp32 pool2d_fwd_fp32.cpp) -endif() +add_example_executable(example_pool2d_fwd_fp16 pool2d_fwd_fp16.cpp) +add_example_executable(example_pool2d_fwd_fp32 pool2d_fwd_fp32.cpp) diff --git a/example/14_gemm_quantization/CMakeLists.txt b/example/14_gemm_quantization/CMakeLists.txt index 3b3ad80dd8..9793e8b8a0 100644 --- a/example/14_gemm_quantization/CMakeLists.txt +++ b/example/14_gemm_quantization/CMakeLists.txt @@ -1,9 +1,5 @@ -if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES) # dlops -if(DL_KERNELS) - add_example_executable(example_gemm_dl_quantization_int8 
gemm_dl_quantization_int8.cpp) -endif() - +add_example_executable(example_gemm_dl_quantization_int8 gemm_dl_quantization_int8.cpp) # xdlops list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942) set(target 0) @@ -14,4 +10,3 @@ foreach(gpu IN LISTS GPU_TARGETS) set(target 1) endif() endforeach() -endif() \ No newline at end of file diff --git a/example/15_grouped_gemm/CMakeLists.txt b/example/15_grouped_gemm/CMakeLists.txt index 45e02126b6..f2c76b76f2 100644 --- a/example/15_grouped_gemm/CMakeLists.txt +++ b/example/15_grouped_gemm/CMakeLists.txt @@ -1,36 +1,44 @@ add_custom_target(example_grouped_gemm_xdl) - -if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES) - add_example_executable(example_grouped_gemm_xdl_fp32 grouped_gemm_xdl_fp32.cpp) +add_example_executable(example_grouped_gemm_xdl_fp32 grouped_gemm_xdl_fp32.cpp) +if(result EQUAL 0) add_dependencies(example_grouped_gemm_xdl example_grouped_gemm_xdl_fp32) endif() -if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) - add_example_executable(example_grouped_gemm_xdl_fp16 grouped_gemm_xdl_fp16.cpp) - add_example_executable(example_grouped_gemm_multiple_d_dl_fp16 grouped_gemm_multiple_d_dl_fp16.cpp) - add_example_executable(example_grouped_gemm_xdl_splitk_fp16 grouped_gemm_xdl_splitk_fp16.cpp) - add_example_executable(example_grouped_gemm_xdl_fixed_nk_fp16 grouped_gemm_xdl_fixed_nk_fp16.cpp) - add_example_executable(example_grouped_gemm_xdl_fixed_nk_bias_fp16 grouped_gemm_xdl_fixed_nk_bias_fp16.cpp) - add_dependencies(example_grouped_gemm_xdl - example_grouped_gemm_xdl_fp16 - example_grouped_gemm_multiple_d_dl_fp16 - example_grouped_gemm_xdl_splitk_fp16 - example_grouped_gemm_xdl_fixed_nk_fp16 - example_grouped_gemm_xdl_fixed_nk_bias_fp16) +add_example_executable(example_grouped_gemm_xdl_fp16 grouped_gemm_xdl_fp16.cpp) +if(result EQUAL 0) + add_dependencies(example_grouped_gemm_xdl example_grouped_gemm_xdl_fp16) endif() -if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES) - 
add_example_executable(example_grouped_gemm_xdl_bfp16 grouped_gemm_xdl_bfp16.cpp) - add_dependencies(example_grouped_gemm_xdl example_grouped_gemm_xdl_bfp16) +add_example_executable(example_grouped_gemm_multiple_d_dl_fp16 grouped_gemm_multiple_d_dl_fp16.cpp) +if(result EQUAL 0) + add_dependencies(example_grouped_gemm_xdl example_grouped_gemm_multiple_d_dl_fp16) endif() -if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES) - add_example_executable(example_grouped_gemm_xdl_int8 grouped_gemm_xdl_int8.cpp) +add_example_executable(example_grouped_gemm_xdl_splitk_fp16 grouped_gemm_xdl_splitk_fp16.cpp) +if(result EQUAL 0) + add_dependencies(example_grouped_gemm_xdl example_grouped_gemm_xdl_splitk_fp16) +endif() +add_example_executable(example_grouped_gemm_xdl_fixed_nk_fp16 grouped_gemm_xdl_fixed_nk_fp16.cpp) +if(result EQUAL 0) + add_dependencies(example_grouped_gemm_xdl example_grouped_gemm_xdl_fixed_nk_fp16) +endif() +add_example_executable(example_grouped_gemm_xdl_fixed_nk_bias_fp16 grouped_gemm_xdl_fixed_nk_bias_fp16.cpp) +if(result EQUAL 0) + add_dependencies(example_grouped_gemm_xdl example_grouped_gemm_xdl_fixed_nk_bias_fp16) +endif() +add_example_executable(example_grouped_gemm_xdl_bf16 grouped_gemm_xdl_bf16.cpp) +if(result EQUAL 0) + add_dependencies(example_grouped_gemm_xdl example_grouped_gemm_xdl_bf16) +endif() +add_example_executable(example_grouped_gemm_xdl_int8 grouped_gemm_xdl_int8.cpp) +if(result EQUAL 0) add_dependencies(example_grouped_gemm_xdl example_grouped_gemm_xdl_int8) endif() -if(DTYPES MATCHES "f8" OR NOT DEFINED DTYPES) - add_example_executable(example_grouped_gemm_xdl_fixed_nk_fp8 grouped_gemm_xdl_fixed_nk_fp8.cpp) +add_example_executable(example_grouped_gemm_xdl_fixed_nk_fp8 grouped_gemm_xdl_fixed_nk_fp8.cpp) +if(result EQUAL 0) add_dependencies(example_grouped_gemm_xdl example_grouped_gemm_xdl_fixed_nk_fp8) endif() if(USE_BITINT_EXTENSION_INT4) add_example_executable(example_grouped_gemm_xdl_int4 grouped_gemm_xdl_int4.cpp) - 
add_dependencies(example_grouped_gemm_xdl example_grouped_gemm_xdl_int4) + if(result EQUAL 0) + add_dependencies(example_grouped_gemm_xdl example_grouped_gemm_xdl_int4) + endif() endif() diff --git a/example/15_grouped_gemm/grouped_gemm_xdl_bfp16.cpp b/example/15_grouped_gemm/grouped_gemm_xdl_bf16.cpp similarity index 100% rename from example/15_grouped_gemm/grouped_gemm_xdl_bfp16.cpp rename to example/15_grouped_gemm/grouped_gemm_xdl_bf16.cpp diff --git a/example/16_gemm_multi_d_multi_reduces/CMakeLists.txt b/example/16_gemm_multi_d_multi_reduces/CMakeLists.txt index 00786d34a3..67cf5666d7 100644 --- a/example/16_gemm_multi_d_multi_reduces/CMakeLists.txt +++ b/example/16_gemm_multi_d_multi_reduces/CMakeLists.txt @@ -6,30 +6,43 @@ foreach(gpu IN LISTS GPU_TARGETS) add_custom_target(example_gemm_reduce_xdl_max) add_custom_target(example_gemm_reduce_xdl_mean_meansquare) add_custom_target(example_gemm_add_add_mean_meansquare_xdl) - if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) - add_example_executable(example_gemm_max_xdl_fp16 gemm_max_xdl_fp16.cpp) - add_example_executable(example_gemm_add_add_mean_meansquare_xdl_fp16 gemm_add_add_mean_meansquare_xdl_fp16.cpp) - add_example_executable(example_gemm_mean_meansquare_xdl_fp16 gemm_mean_meansquare_xdl_fp16.cpp) + add_example_executable(example_gemm_max_xdl_fp16 gemm_max_xdl_fp16.cpp) + if(result EQUAL 0) add_dependencies(example_gemm_reduce_xdl_max example_gemm_max_xdl_fp16) + endif() + add_example_executable(example_gemm_add_add_mean_meansquare_xdl_fp16 gemm_add_add_mean_meansquare_xdl_fp16.cpp) + if(result EQUAL 0) add_dependencies(example_gemm_add_add_mean_meansquare_xdl example_gemm_add_add_mean_meansquare_xdl_fp16) + endif() + add_example_executable(example_gemm_mean_meansquare_xdl_fp16 gemm_mean_meansquare_xdl_fp16.cpp) + if(result EQUAL 0) add_dependencies(example_gemm_reduce_xdl_mean_meansquare example_gemm_mean_meansquare_xdl_fp16) endif() - if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES) - 
add_example_executable(example_gemm_max_xdl_int8 gemm_max_xdl_int8.cpp) - add_example_executable(example_gemm_add_addsquare_xdl_int8 gemm_add_addsquare_xdl_int8.cpp) + + add_example_executable(example_gemm_max_xdl_int8 gemm_max_xdl_int8.cpp) + if(result EQUAL 0) add_dependencies(example_gemm_reduce_xdl_max example_gemm_max_xdl_int8) + endif() + add_example_executable(example_gemm_add_addsquare_xdl_int8 gemm_add_addsquare_xdl_int8.cpp) + if(result EQUAL 0) add_dependencies(example_gemm_reduce_xdl_mean_meansquare example_gemm_add_addsquare_xdl_int8) endif() - if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES) - add_example_executable(example_gemm_max_xdl_fp32 gemm_max_xdl_fp32.cpp) - add_example_executable(example_gemm_mean_meansquare_xdl_fp32 gemm_mean_meansquare_xdl_fp32.cpp) + + add_example_executable(example_gemm_max_xdl_fp32 gemm_max_xdl_fp32.cpp) + if(result EQUAL 0) add_dependencies(example_gemm_reduce_xdl_max example_gemm_max_xdl_fp32) + endif() + add_example_executable(example_gemm_mean_meansquare_xdl_fp32 gemm_mean_meansquare_xdl_fp32.cpp) + if(result EQUAL 0) add_dependencies(example_gemm_reduce_xdl_mean_meansquare example_gemm_mean_meansquare_xdl_fp32) endif() - if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES) - add_example_executable(example_gemm_max_xdl_bf16 gemm_max_xdl_bf16.cpp) - add_example_executable(example_gemm_mean_meansquare_xdl_bf16 gemm_mean_meansquare_xdl_bf16.cpp) + + add_example_executable(example_gemm_max_xdl_bf16 gemm_max_xdl_bf16.cpp) + if(result EQUAL 0) add_dependencies(example_gemm_reduce_xdl_max example_gemm_max_xdl_bf16) + endif() + add_example_executable(example_gemm_mean_meansquare_xdl_bf16 gemm_mean_meansquare_xdl_bf16.cpp) + if(result EQUAL 0) add_dependencies(example_gemm_reduce_xdl_mean_meansquare example_gemm_mean_meansquare_xdl_bf16) endif() @@ -40,7 +53,9 @@ foreach(gpu IN LISTS GPU_TARGETS) if(USE_BITINT_EXTENSION_INT4) add_example_executable(example_gemm_max_xdl_int4 gemm_max_xdl_int4.cpp) - 
add_dependencies(example_gemm_reduce_xdl_max example_gemm_max_xdl_int4) + if(result EQUAL 0) + add_dependencies(example_gemm_reduce_xdl_max example_gemm_max_xdl_int4) + endif() endif() set(target 1) endif() diff --git a/example/17_convnd_bwd_data/CMakeLists.txt b/example/17_convnd_bwd_data/CMakeLists.txt index e187bd4337..7c6d10d8a0 100644 --- a/example/17_convnd_bwd_data/CMakeLists.txt +++ b/example/17_convnd_bwd_data/CMakeLists.txt @@ -1,15 +1,16 @@ -if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942) set(target 0) foreach(gpu IN LISTS GPU_TARGETS) if(gpu IN_LIST gpu_list AND target EQUAL 0) add_example_executable(example_convnd_bwd_data_xdl_fp16 convnd_bwd_data_xdl_fp16.cpp) - target_link_libraries(example_convnd_bwd_data_xdl_fp16 PRIVATE utility) + if(result EQUAL 0) + target_link_libraries(example_convnd_bwd_data_xdl_fp16 PRIVATE utility) + endif() set(target 1) endif() endforeach() - if(DL_KERNELS) - add_example_executable(example_convnd_bwd_data_dl_fp16 convnd_bwd_data_dl_fp16.cpp) - target_link_libraries(example_convnd_bwd_data_dl_fp16 PRIVATE utility) - endif() + +add_example_executable(example_convnd_bwd_data_dl_fp16 convnd_bwd_data_dl_fp16.cpp) +if(result EQUAL 0) + target_link_libraries(example_convnd_bwd_data_dl_fp16 PRIVATE utility) endif() diff --git a/example/18_batched_gemm_reduce/CMakeLists.txt b/example/18_batched_gemm_reduce/CMakeLists.txt index a1bb398af0..94ed129dc0 100644 --- a/example/18_batched_gemm_reduce/CMakeLists.txt +++ b/example/18_batched_gemm_reduce/CMakeLists.txt @@ -1,4 +1,3 @@ -if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942) set(target 0) foreach(gpu IN LISTS GPU_TARGETS) @@ -7,4 +6,3 @@ foreach(gpu IN LISTS GPU_TARGETS) set(target 1) endif() endforeach() -endif() diff --git a/example/20_grouped_conv_bwd_weight/CMakeLists.txt b/example/20_grouped_conv_bwd_weight/CMakeLists.txt index d649567ed2..ab8ed5c2bf 100644 --- 
a/example/20_grouped_conv_bwd_weight/CMakeLists.txt +++ b/example/20_grouped_conv_bwd_weight/CMakeLists.txt @@ -3,22 +3,20 @@ set(target 0) foreach(gpu IN LISTS GPU_TARGETS) if(gpu IN_LIST gpu_list AND target EQUAL 0) add_custom_target(example_grouped_conv_bwd_weight) - if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) - add_example_executable(example_grouped_conv_bwd_weight_xdl_fp16 grouped_conv_bwd_weight_xdl_fp16.cpp) + add_example_executable(example_grouped_conv_bwd_weight_xdl_fp16 grouped_conv_bwd_weight_xdl_fp16.cpp) + if(result EQUAL 0) add_dependencies(example_grouped_conv_bwd_weight example_grouped_conv_bwd_weight_xdl_fp16) endif() - if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES) - add_example_executable(example_grouped_conv_bwd_weight_xdl_bf16 grouped_conv_bwd_weight_xdl_bf16.cpp) + add_example_executable(example_grouped_conv_bwd_weight_xdl_bf16 grouped_conv_bwd_weight_xdl_bf16.cpp) + if(result EQUAL 0) add_dependencies(example_grouped_conv_bwd_weight example_grouped_conv_bwd_weight_xdl_bf16) endif() set(target 1) endif() endforeach() -if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) - if(DL_KERNELS) - add_custom_target(example_grouped_conv_bwd_weight_dl) - add_example_executable(example_grouped_conv_bwd_weight_dl_fp16 grouped_conv_bwd_weight_dl_fp16.cpp) - add_dependencies(example_grouped_conv_bwd_weight_dl example_grouped_conv_bwd_weight_dl_fp16) - endif() -endif() \ No newline at end of file +add_custom_target(example_grouped_conv_bwd_weight_dl) +add_example_executable(example_grouped_conv_bwd_weight_dl_fp16 grouped_conv_bwd_weight_dl_fp16.cpp) +if(result EQUAL 0) + add_dependencies(example_grouped_conv_bwd_weight_dl example_grouped_conv_bwd_weight_dl_fp16) +endif() diff --git a/example/21_gemm_layernorm/CMakeLists.txt b/example/21_gemm_layernorm/CMakeLists.txt index 6a6735efd6..e231bc619b 100644 --- a/example/21_gemm_layernorm/CMakeLists.txt +++ b/example/21_gemm_layernorm/CMakeLists.txt @@ -1,4 +1,3 @@ -if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) 
list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942) set(target 0) foreach(gpu IN LISTS GPU_TARGETS) @@ -10,4 +9,4 @@ foreach(gpu IN LISTS GPU_TARGETS) set(target 1) endif() endforeach() -endif() + diff --git a/example/22_cgemm/CMakeLists.txt b/example/22_cgemm/CMakeLists.txt index 854f07fda6..f401f7187f 100644 --- a/example/22_cgemm/CMakeLists.txt +++ b/example/22_cgemm/CMakeLists.txt @@ -1,19 +1,19 @@ add_custom_target(example_cgemm_xdl) -if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES) - add_example_executable(example_cgemm_xdl_bf16 cgemm_xdl_bf16.cpp) +add_example_executable(example_cgemm_xdl_bf16 cgemm_xdl_bf16.cpp) +if(result EQUAL 0) add_dependencies(example_cgemm_xdl example_cgemm_xdl_bf16) endif() -if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) - add_example_executable(example_cgemm_xdl_fp16 cgemm_xdl_fp16.cpp) +add_example_executable(example_cgemm_xdl_fp16 cgemm_xdl_fp16.cpp) +if(result EQUAL 0) add_dependencies(example_cgemm_xdl example_cgemm_xdl_fp16) endif() -if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES) add_example_executable(example_cgemm_xdl_fp32 cgemm_xdl_fp32.cpp) -add_dependencies(example_cgemm_xdl example_cgemm_xdl_fp32) +if(result EQUAL 0) + add_dependencies(example_cgemm_xdl example_cgemm_xdl_fp32) endif() -if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES) - add_example_executable(example_cgemm_xdl_int8 cgemm_xdl_int8.cpp) +add_example_executable(example_cgemm_xdl_int8 cgemm_xdl_int8.cpp) +if(result EQUAL 0) add_dependencies(example_cgemm_xdl example_cgemm_xdl_int8) endif() if(USE_BITINT_EXTENSION_INT4) diff --git a/example/24_batched_gemm/CMakeLists.txt b/example/24_batched_gemm/CMakeLists.txt index 48a3b58ff5..c80540de7d 100644 --- a/example/24_batched_gemm/CMakeLists.txt +++ b/example/24_batched_gemm/CMakeLists.txt @@ -1,21 +1,23 @@ add_custom_target(example_batched_gemm_xdl) -if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES) - add_example_executable(example_batched_gemm_xdl_fp32 batched_gemm_xdl_fp32.cpp) 
+add_example_executable(example_batched_gemm_xdl_fp32 batched_gemm_xdl_fp32.cpp) +if(result EQUAL 0) add_dependencies(example_batched_gemm_xdl example_batched_gemm_xdl_fp32) endif() -if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) - add_example_executable(example_batched_gemm_xdl_fp16 batched_gemm_xdl_fp16.cpp) +add_example_executable(example_batched_gemm_xdl_fp16 batched_gemm_xdl_fp16.cpp) +if(result EQUAL 0) add_dependencies(example_batched_gemm_xdl example_batched_gemm_xdl_fp16) endif() -if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES) - add_example_executable(example_batched_gemm_xdl_bfp16 batched_gemm_xdl_bfp16.cpp) - add_dependencies(example_batched_gemm_xdl example_batched_gemm_xdl_bfp16) +add_example_executable(example_batched_gemm_xdl_bf16 batched_gemm_xdl_bf16.cpp) +if(result EQUAL 0) + add_dependencies(example_batched_gemm_xdl example_batched_gemm_xdl_bf16) endif() -if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES) - add_example_executable(example_batched_gemm_xdl_int8 batched_gemm_xdl_int8.cpp) +add_example_executable(example_batched_gemm_xdl_int8 batched_gemm_xdl_int8.cpp) +if(result EQUAL 0) add_dependencies(example_batched_gemm_xdl example_batched_gemm_xdl_int8) endif() if(USE_BITINT_EXTENSION_INT4) add_example_executable(example_batched_gemm_xdl_int4 batched_gemm_xdl_int4.cpp) - add_dependencies(example_batched_gemm_xdl example_batched_gemm_xdl_int4) + if(result EQUAL 0) + add_dependencies(example_batched_gemm_xdl example_batched_gemm_xdl_int4) + endif() endif() diff --git a/example/24_batched_gemm/batched_gemm_xdl_bfp16.cpp b/example/24_batched_gemm/batched_gemm_xdl_bf16.cpp similarity index 100% rename from example/24_batched_gemm/batched_gemm_xdl_bfp16.cpp rename to example/24_batched_gemm/batched_gemm_xdl_bf16.cpp diff --git a/example/25_gemm_bias_e_permute/CMakeLists.txt b/example/25_gemm_bias_e_permute/CMakeLists.txt index eb274b2338..cbc3c007bc 100644 --- a/example/25_gemm_bias_e_permute/CMakeLists.txt +++ 
b/example/25_gemm_bias_e_permute/CMakeLists.txt @@ -1,4 +1,2 @@ -if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) - add_example_executable(example_gemm_bias_e_permute_g1m3n2k1_xdl_fp16 gemm_bias_e_permute_g1m3n2k1_xdl_fp16.cpp) - add_example_executable(example_gemm_bias_e_permute_g1m2n3k1_xdl_fp16 gemm_bias_e_permute_g1m2n3k1_xdl_fp16.cpp) -endif() +add_example_executable(example_gemm_bias_e_permute_g1m3n2k1_xdl_fp16 gemm_bias_e_permute_g1m3n2k1_xdl_fp16.cpp) +add_example_executable(example_gemm_bias_e_permute_g1m2n3k1_xdl_fp16 gemm_bias_e_permute_g1m2n3k1_xdl_fp16.cpp) diff --git a/example/26_contraction/CMakeLists.txt b/example/26_contraction/CMakeLists.txt index 6cab88b13e..167a9f147d 100644 --- a/example/26_contraction/CMakeLists.txt +++ b/example/26_contraction/CMakeLists.txt @@ -1,8 +1,4 @@ -if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES) - add_example_executable(example_contraction_bilinear_xdl_fp32 contraction_bilinear_xdl_fp32.cpp) - add_example_executable(example_contraction_scale_xdl_fp32 contraction_scale_xdl_fp32.cpp) -endif() -if(DTYPES MATCHES "fp64" OR NOT DEFINED DTYPES) - add_example_executable(example_contraction_bilinear_xdl_fp64 contraction_bilinear_xdl_fp64.cpp) - add_example_executable(example_contraction_scale_xdl_fp64 contraction_scale_xdl_fp64.cpp) -endif() +add_example_executable(example_contraction_bilinear_xdl_fp32 contraction_bilinear_xdl_fp32.cpp) +add_example_executable(example_contraction_scale_xdl_fp32 contraction_scale_xdl_fp32.cpp) +add_example_executable(example_contraction_bilinear_xdl_fp64 contraction_bilinear_xdl_fp64.cpp) +add_example_executable(example_contraction_scale_xdl_fp64 contraction_scale_xdl_fp64.cpp) diff --git a/example/27_layernorm/CMakeLists.txt b/example/27_layernorm/CMakeLists.txt index 9cb2cd0766..94c23ce774 100644 --- a/example/27_layernorm/CMakeLists.txt +++ b/example/27_layernorm/CMakeLists.txt @@ -1,4 +1,2 @@ -if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) - add_example_executable(example_layernorm_fp16 
layernorm_fp16.cpp) - add_example_executable(example_layernorm_splitk_fp16 layernorm_splitk_fp16.cpp) -endif() +add_example_executable(example_layernorm_fp16 layernorm_fp16.cpp) +add_example_executable(example_layernorm_splitk_fp16 layernorm_splitk_fp16.cpp) diff --git a/example/28_grouped_gemm_bias_e_permute/CMakeLists.txt b/example/28_grouped_gemm_bias_e_permute/CMakeLists.txt index 2fda1f62a9..44ab16894c 100644 --- a/example/28_grouped_gemm_bias_e_permute/CMakeLists.txt +++ b/example/28_grouped_gemm_bias_e_permute/CMakeLists.txt @@ -1,3 +1 @@ -if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) - add_example_executable(example_grouped_gemm_bias_e_permute_xdl_fp16 grouped_gemm_bias_e_permute_xdl_fp16.cpp) -endif() +add_example_executable(example_grouped_gemm_bias_e_permute_xdl_fp16 grouped_gemm_bias_e_permute_xdl_fp16.cpp) diff --git a/example/29_batched_gemm_bias_e_permute/CMakeLists.txt b/example/29_batched_gemm_bias_e_permute/CMakeLists.txt index 09c3e6c608..32a87dd200 100644 --- a/example/29_batched_gemm_bias_e_permute/CMakeLists.txt +++ b/example/29_batched_gemm_bias_e_permute/CMakeLists.txt @@ -1,7 +1,5 @@ -if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) - add_example_executable(example_batched_gemm_bias_e_permute_xdl_fp16 batched_gemm_bias_e_permute_xdl_fp16.cpp) +add_example_executable(example_batched_gemm_bias_e_permute_xdl_fp16 batched_gemm_bias_e_permute_xdl_fp16.cpp) - if(GPU_TARGETS MATCHES "gfx1100" OR GPU_TARGETS MATCHES "gfx1101" OR GPU_TARGETS MATCHES "gfx1102") - add_example_executable(example_batched_gemm_bias_e_permute_wmma_fp16 batched_gemm_bias_e_permute_wmma_fp16.cpp) - endif() +if(GPU_TARGETS MATCHES "gfx1100" OR GPU_TARGETS MATCHES "gfx1101" OR GPU_TARGETS MATCHES "gfx1102") + add_example_executable(example_batched_gemm_bias_e_permute_wmma_fp16 batched_gemm_bias_e_permute_wmma_fp16.cpp) endif() diff --git a/example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt b/example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt index 
e37413c098..4ab5bd75f2 100644 --- a/example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt +++ b/example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt @@ -5,27 +5,31 @@ set(target 0) foreach(gpu IN LISTS GPU_TARGETS) if(gpu IN_LIST gpu_list1 AND target EQUAL 0) add_custom_target(example_grouped_conv_fwd_multiple_d) - if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) - add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_fp16 grouped_conv_fwd_bias_relu_add_xdl_fp16.cpp) + add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_fp16 grouped_conv_fwd_bias_relu_add_xdl_fp16.cpp) + if(result EQUAL 0) add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_fp16) - add_example_executable(example_grouped_conv_fwd_xdl_fp16 grouped_conv_fwd_xdl_fp16.cpp) + endif() + add_example_executable(example_grouped_conv_fwd_xdl_fp16 grouped_conv_fwd_xdl_fp16.cpp) + if(result EQUAL 0) add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_xdl_fp16) endif() - if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES) - add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_fp32 grouped_conv_fwd_bias_relu_add_xdl_fp32.cpp) + add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_fp32 grouped_conv_fwd_bias_relu_add_xdl_fp32.cpp) + if(result EQUAL 0) add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_fp32) endif() - if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES) - add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_bf16 grouped_conv_fwd_bias_relu_add_xdl_bf16.cpp) + add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_bf16 grouped_conv_fwd_bias_relu_add_xdl_bf16.cpp) + if(result EQUAL 0) add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_bf16) endif() - if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES) - add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_int8 
grouped_conv_fwd_bias_relu_add_xdl_int8.cpp) + add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_int8 grouped_conv_fwd_bias_relu_add_xdl_int8.cpp) + if(result EQUAL 0) add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_int8) endif() if(USE_BITINT_EXTENSION_INT4) add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_int4 grouped_conv_fwd_bias_relu_add_xdl_int4.cpp) - add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_int4) + if(result EQUAL 0) + add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_int4) + endif() endif() # USE_BITINT_EXTENSION_INT4 set(target 1) @@ -35,12 +39,8 @@ endforeach() set(target 0) foreach(gpu IN LISTS GPU_TARGETS) if(gpu IN_LIST gpu_list2 AND target EQUAL 0) - if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) - add_example_executable(example_grouped_conv_fwd_bias_relu_add_wmma_fp16 grouped_conv_fwd_bias_relu_add_wmma_fp16.cpp) - endif() - if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES) - add_example_executable(example_grouped_conv_fwd_bias_relu_add_wmma_int8 grouped_conv_fwd_bias_relu_add_wmma_int8.cpp) - endif() + add_example_executable(example_grouped_conv_fwd_bias_relu_add_wmma_fp16 grouped_conv_fwd_bias_relu_add_wmma_fp16.cpp) + add_example_executable(example_grouped_conv_fwd_bias_relu_add_wmma_int8 grouped_conv_fwd_bias_relu_add_wmma_int8.cpp) set(target 1) endif() endforeach() diff --git a/example/31_batched_gemm_gemm/CMakeLists.txt b/example/31_batched_gemm_gemm/CMakeLists.txt index 2074520f8c..93f16c945f 100644 --- a/example/31_batched_gemm_gemm/CMakeLists.txt +++ b/example/31_batched_gemm_gemm/CMakeLists.txt @@ -1,17 +1,11 @@ list(APPEND gpu_list1 gfx908 gfx90a gfx940 gfx941 gfx942) -list(APPEND gpu_list2 gfx908 gfx90a) + set(target 0) foreach(gpu IN LISTS GPU_TARGETS) if(gpu IN_LIST gpu_list1 AND target EQUAL 0) - if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES) - 
add_example_executable(example_batched_gemm_gemm_xdl_fp32 batched_gemm_gemm_xdl_fp32.cpp) - endif() - if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) - add_example_executable(example_batched_gemm_gemm_xdl_fp16 batched_gemm_gemm_xdl_fp16.cpp) - endif() - if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES) - add_example_executable(example_batched_gemm_gemm_xdl_bf16 batched_gemm_gemm_xdl_bf16.cpp) - endif() + add_example_executable(example_batched_gemm_gemm_xdl_fp32 batched_gemm_gemm_xdl_fp32.cpp) + add_example_executable(example_batched_gemm_gemm_xdl_fp16 batched_gemm_gemm_xdl_fp16.cpp) + add_example_executable(example_batched_gemm_gemm_xdl_bf16 batched_gemm_gemm_xdl_bf16.cpp) if(USE_BITINT_EXTENSION_INT4) add_example_executable(example_batched_gemm_gemm_xdl_int4 batched_gemm_gemm_xdl_int4.cpp) endif(USE_BITINT_EXTENSION_INT4) @@ -20,7 +14,5 @@ foreach(gpu IN LISTS GPU_TARGETS) endforeach() if(NOT GPU_TARGETS MATCHES "gfx94" AND NOT GPU_TARGETS MATCHES "gfx1") - if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES) - add_example_executable(example_batched_gemm_gemm_xdl_int8 batched_gemm_gemm_xdl_int8.cpp) - endif() + add_example_executable(example_batched_gemm_gemm_xdl_int8 batched_gemm_gemm_xdl_int8.cpp) endif() diff --git a/example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt b/example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt index 0463bf6bd3..25eb44ae59 100644 --- a/example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt +++ b/example/32_batched_gemm_scale_softmax_gemm/CMakeLists.txt @@ -1,24 +1,31 @@ -if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) - add_example_executable(example_batched_gemm_scale_softmax_gemm_xdl_fp16 batched_gemm_scale_softmax_gemm_xdl_fp16.cpp) - add_example_executable(example_batched_gemm_scale_softmax_gemm_permute_xdl_fp16 batched_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp) - add_example_executable(example_grouped_gemm_scale_softmax_gemm_permute_xdl_fp16 grouped_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp) - 
add_example_executable(example_batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16 batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp) - add_example_executable(example_grouped_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16 grouped_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp) -endif() -if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES) - add_example_executable(example_batched_gemm_scale_softmax_gemm_xdl_bf16 batched_gemm_scale_softmax_gemm_xdl_bf16.cpp) - add_example_executable(example_batched_gemm_scale_softmax_gemm_permute_xdl_bf16 batched_gemm_scale_softmax_gemm_permute_xdl_bf16.cpp) -endif() - add_custom_target(example_gemm_scale_softmax_gemm) -if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) + +add_example_executable(example_batched_gemm_scale_softmax_gemm_xdl_fp16 batched_gemm_scale_softmax_gemm_xdl_fp16.cpp) +if(result EQUAL 0) add_dependencies(example_gemm_scale_softmax_gemm example_batched_gemm_scale_softmax_gemm_xdl_fp16) +endif() +add_example_executable(example_batched_gemm_scale_softmax_gemm_permute_xdl_fp16 batched_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp) +if(result EQUAL 0) add_dependencies(example_gemm_scale_softmax_gemm example_batched_gemm_scale_softmax_gemm_permute_xdl_fp16) +endif() +add_example_executable(example_grouped_gemm_scale_softmax_gemm_permute_xdl_fp16 grouped_gemm_scale_softmax_gemm_permute_xdl_fp16.cpp) +if(result EQUAL 0) add_dependencies(example_gemm_scale_softmax_gemm example_grouped_gemm_scale_softmax_gemm_permute_xdl_fp16) +endif() +add_example_executable(example_batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16 batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp) +if(result EQUAL 0) add_dependencies(example_gemm_scale_softmax_gemm example_batched_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16) +endif() +add_example_executable(example_grouped_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16 
grouped_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16.cpp) +if(result EQUAL 0) add_dependencies(example_gemm_scale_softmax_gemm example_grouped_gemm_lower_triangle_scale_softmax_gemm_permute_xdl_fp16) endif() -if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES) +add_example_executable(example_batched_gemm_scale_softmax_gemm_xdl_bf16 batched_gemm_scale_softmax_gemm_xdl_bf16.cpp) +if(result EQUAL 0) add_dependencies(example_gemm_scale_softmax_gemm example_batched_gemm_scale_softmax_gemm_xdl_bf16) +endif() +add_example_executable(example_batched_gemm_scale_softmax_gemm_permute_xdl_bf16 batched_gemm_scale_softmax_gemm_permute_xdl_bf16.cpp) +if(result EQUAL 0) add_dependencies(example_gemm_scale_softmax_gemm example_batched_gemm_scale_softmax_gemm_permute_xdl_bf16) endif() + diff --git a/example/35_splitK_gemm/CMakeLists.txt b/example/35_splitK_gemm/CMakeLists.txt index 251a9b93c5..8970a57648 100644 --- a/example/35_splitK_gemm/CMakeLists.txt +++ b/example/35_splitK_gemm/CMakeLists.txt @@ -3,25 +3,28 @@ set(target 0) foreach(gpu IN LISTS GPU_TARGETS) if(gpu IN_LIST gpu_list AND target EQUAL 0) add_custom_target(example_splitK_gemm_xdl) - if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES) - add_example_executable(example_splitK_gemm_xdl_fp32 splitK_gemm_xdl_fp32.cpp) + + add_example_executable(example_splitK_gemm_xdl_fp32 splitK_gemm_xdl_fp32.cpp) + if(result EQUAL 0) add_dependencies(example_splitK_gemm_xdl example_splitK_gemm_xdl_fp32) endif() - if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) - add_example_executable(example_splitK_gemm_xdl_fp16 splitK_gemm_xdl_fp16.cpp) + add_example_executable(example_splitK_gemm_xdl_fp16 splitK_gemm_xdl_fp16.cpp) + if(result EQUAL 0) add_dependencies(example_splitK_gemm_xdl example_splitK_gemm_xdl_fp16) endif() - if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES) - add_example_executable(example_splitK_gemm_xdl_bfp16 splitK_gemm_xdl_bfp16.cpp) - add_dependencies(example_splitK_gemm_xdl example_splitK_gemm_xdl_bfp16) + 
add_example_executable(example_splitK_gemm_xdl_bf16 splitK_gemm_xdl_bf16.cpp) + if(result EQUAL 0) + add_dependencies(example_splitK_gemm_xdl example_splitK_gemm_xdl_bf16) endif() - if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES) - add_example_executable(example_splitK_gemm_xdl_int8 splitK_gemm_xdl_int8.cpp) + add_example_executable(example_splitK_gemm_xdl_int8 splitK_gemm_xdl_int8.cpp) + if(result EQUAL 0) add_dependencies(example_splitK_gemm_xdl example_splitK_gemm_xdl_int8) endif() if(USE_BITINT_EXTENSION_INT4) add_example_executable(example_splitK_gemm_xdl_int4 splitK_gemm_xdl_int4.cpp) - add_dependencies(example_splitK_gemm_xdl example_splitK_gemm_xdl_int4) + if(result EQUAL 0) + add_dependencies(example_splitK_gemm_xdl example_splitK_gemm_xdl_int4) + endif() endif() set(target 1) endif() diff --git a/example/35_splitK_gemm/splitK_gemm_xdl_bfp16.cpp b/example/35_splitK_gemm/splitK_gemm_xdl_bf16.cpp similarity index 100% rename from example/35_splitK_gemm/splitK_gemm_xdl_bfp16.cpp rename to example/35_splitK_gemm/splitK_gemm_xdl_bf16.cpp diff --git a/example/37_batched_gemm_add_add_relu_gemm_add/CMakeLists.txt b/example/37_batched_gemm_add_add_relu_gemm_add/CMakeLists.txt index 36bb5720d5..a9be3a7108 100644 --- a/example/37_batched_gemm_add_add_relu_gemm_add/CMakeLists.txt +++ b/example/37_batched_gemm_add_add_relu_gemm_add/CMakeLists.txt @@ -1,3 +1 @@ -if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) - add_example_executable(example_batched_gemm_add_add_relu_gemm_add_xdl_fp16 batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp) -endif() +add_example_executable(example_batched_gemm_add_add_relu_gemm_add_xdl_fp16 batched_gemm_add_add_relu_gemm_add_xdl_fp16.cpp) diff --git a/example/38_grouped_conv_bwd_data_multiple_d/CMakeLists.txt b/example/38_grouped_conv_bwd_data_multiple_d/CMakeLists.txt index 3821f8aaca..1675e3663a 100644 --- a/example/38_grouped_conv_bwd_data_multiple_d/CMakeLists.txt +++ b/example/38_grouped_conv_bwd_data_multiple_d/CMakeLists.txt @@ -1,15 
+1,16 @@ -if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942) set(target 0) foreach(gpu IN LISTS GPU_TARGETS) if(gpu IN_LIST gpu_list AND target EQUAL 0) add_custom_target(example_grouped_conv_bwd_data) add_example_executable(example_grouped_conv_bwd_data_fp16 grouped_conv_bwd_data_fp16.cpp) + if(result EQUAL 0) + add_dependencies(example_grouped_conv_bwd_data example_grouped_conv_bwd_data_fp16) + endif() add_example_executable(example_grouped_conv_bwd_data_bias_relu_fp16 grouped_conv_bwd_data_bias_relu_fp16.cpp) - - add_dependencies(example_grouped_conv_bwd_data example_grouped_conv_bwd_data_fp16) - add_dependencies(example_grouped_conv_bwd_data example_grouped_conv_bwd_data_bias_relu_fp16) + if(result EQUAL 0) + add_dependencies(example_grouped_conv_bwd_data example_grouped_conv_bwd_data_bias_relu_fp16) + endif() set(target 1) endif() endforeach() -endif() diff --git a/example/39_permute/CMakeLists.txt b/example/39_permute/CMakeLists.txt index 5b43de9725..bcf47b4926 100644 --- a/example/39_permute/CMakeLists.txt +++ b/example/39_permute/CMakeLists.txt @@ -1,11 +1,14 @@ -if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) - add_custom_target(example_permute) - - add_example_executable(example_permute_1xHxW_fp16 permute_1xHxW_fp16.cpp) - add_example_executable(example_permute_NxHxW_fp16 permute_NxHxW_fp16.cpp) - add_example_executable(example_permute_HxWx4_fp16 permute_HxWx4_fp16.cpp) +add_custom_target(example_permute) +add_example_executable(example_permute_1xHxW_fp16 permute_1xHxW_fp16.cpp) +if(result EQUAL 0) add_dependencies(example_permute example_permute_1xHxW_fp16) +endif() +add_example_executable(example_permute_NxHxW_fp16 permute_NxHxW_fp16.cpp) +if(result EQUAL 0) add_dependencies(example_permute example_permute_NxHxW_fp16) +endif() +add_example_executable(example_permute_HxWx4_fp16 permute_HxWx4_fp16.cpp) +if(result EQUAL 0) add_dependencies(example_permute example_permute_HxWx4_fp16) endif() diff --git 
a/example/40_conv2d_fwd_quantization/CMakeLists.txt b/example/40_conv2d_fwd_quantization/CMakeLists.txt index 55464957ac..2d804cafe9 100644 --- a/example/40_conv2d_fwd_quantization/CMakeLists.txt +++ b/example/40_conv2d_fwd_quantization/CMakeLists.txt @@ -1,4 +1,3 @@ -if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES) list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942) set(target 0) foreach(gpu IN LISTS GPU_TARGETS) @@ -11,7 +10,6 @@ foreach(gpu IN LISTS GPU_TARGETS) endif() endforeach() - if(DL_KERNELS) # Conv perlayer quantization add_example_executable(example_conv2d_fwd_dl_perlayer_quantization_int8 conv2d_fwd_dl_perlayer_quantization_int8.cpp) # Conv perchannel quantization @@ -24,5 +22,3 @@ endforeach() add_example_executable(example_conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8 conv2d_fwd_dl_bias_tanh_perlayer_quantization_int8.cpp) # Conv + bias + tanh perchannel quantization add_example_executable(example_conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8 conv2d_fwd_dl_bias_tanh_perchannel_quantization_int8.cpp) - endif() -endif() \ No newline at end of file diff --git a/example/41_grouped_conv_conv_fwd/CMakeLists.txt b/example/41_grouped_conv_conv_fwd/CMakeLists.txt index 085d90a9e9..ae251e88d2 100644 --- a/example/41_grouped_conv_conv_fwd/CMakeLists.txt +++ b/example/41_grouped_conv_conv_fwd/CMakeLists.txt @@ -3,15 +3,9 @@ list(APPEND gpu_list2 gfx908 gfx90a) set(target 0) foreach(gpu IN LISTS GPU_TARGETS) if(gpu IN_LIST gpu_list1 AND target EQUAL 0) - if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES) - add_example_executable(example_grouped_conv_conv_fwd_xdl_fp32 grouped_conv_conv_fwd_xdl_fp32.cpp) - endif() - if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) - add_example_executable(example_grouped_conv_conv_fwd_xdl_fp16 grouped_conv_conv_fwd_xdl_fp16.cpp) - endif() - if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES) - add_example_executable(example_grouped_conv_conv_fwd_xdl_bf16 grouped_conv_conv_fwd_xdl_bf16.cpp) - endif() + 
add_example_executable(example_grouped_conv_conv_fwd_xdl_fp32 grouped_conv_conv_fwd_xdl_fp32.cpp) + add_example_executable(example_grouped_conv_conv_fwd_xdl_fp16 grouped_conv_conv_fwd_xdl_fp16.cpp) + add_example_executable(example_grouped_conv_conv_fwd_xdl_bf16 grouped_conv_conv_fwd_xdl_bf16.cpp) if(USE_BITINT_EXTENSION_INT4) add_example_executable(example_grouped_conv_conv_fwd_xdl_int4 grouped_conv_conv_fwd_xdl_int4.cpp) endif(USE_BITINT_EXTENSION_INT4) @@ -20,7 +14,5 @@ foreach(gpu IN LISTS GPU_TARGETS) endforeach() if(NOT GPU_TARGETS MATCHES "gfx94" AND NOT GPU_TARGETS MATCHES "gfx1") - if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES) - add_example_executable(example_grouped_conv_conv_fwd_xdl_int8 grouped_conv_conv_fwd_xdl_int8.cpp) - endif() + add_example_executable(example_grouped_conv_conv_fwd_xdl_int8 grouped_conv_conv_fwd_xdl_int8.cpp) endif() diff --git a/example/42_groupnorm/CMakeLists.txt b/example/42_groupnorm/CMakeLists.txt index bc2246a2bf..e8c306ac58 100644 --- a/example/42_groupnorm/CMakeLists.txt +++ b/example/42_groupnorm/CMakeLists.txt @@ -1,5 +1,3 @@ -if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) - add_example_executable(example_groupnorm_sigmoid_mul_fp16 groupnorm_sigmoid_mul_fp16.cpp) - add_example_executable(example_groupnorm_splitk_fp16 groupnorm_splitk_fp16.cpp) - add_example_executable(example_groupnorm_swish_fp16 groupnorm_swish_fp16.cpp) -endif() +add_example_executable(example_groupnorm_sigmoid_mul_fp16 groupnorm_sigmoid_mul_fp16.cpp) +add_example_executable(example_groupnorm_splitk_fp16 groupnorm_splitk_fp16.cpp) +add_example_executable(example_groupnorm_swish_fp16 groupnorm_swish_fp16.cpp) diff --git a/example/43_splitk_gemm_bias_e_permute/CMakeLists.txt b/example/43_splitk_gemm_bias_e_permute/CMakeLists.txt index 7e070f5357..c29f18f162 100644 --- a/example/43_splitk_gemm_bias_e_permute/CMakeLists.txt +++ b/example/43_splitk_gemm_bias_e_permute/CMakeLists.txt @@ -1,6 +1,2 @@ -if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) - 
add_example_executable(example_splitk_gemm_bias_e_permute_xdl_fp16 splitk_gemm_bias_e_permute_xdl_fp16.cpp) -endif() -if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES) - add_example_executable(example_splitk_gemm_bias_e_permute_xdl_fp32 splitk_gemm_bias_e_permute_xdl_fp32.cpp) -endif() +add_example_executable(example_splitk_gemm_bias_e_permute_xdl_fp16 splitk_gemm_bias_e_permute_xdl_fp16.cpp) +add_example_executable(example_splitk_gemm_bias_e_permute_xdl_fp32 splitk_gemm_bias_e_permute_xdl_fp32.cpp) diff --git a/example/44_elementwise_permute/CMakeLists.txt b/example/44_elementwise_permute/CMakeLists.txt index 877a820315..0e0091a986 100644 --- a/example/44_elementwise_permute/CMakeLists.txt +++ b/example/44_elementwise_permute/CMakeLists.txt @@ -1,4 +1,2 @@ -if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) - add_example_executable(example_elementwise_permute_4D_fp16 elementwise_permute_4D_fp16.cpp) - add_example_executable(example_elementwise_permute_4D_fp16_2d elementwise_permute_4D_fp16_2d.cpp) -endif() +add_example_executable(example_elementwise_permute_4D_fp16 elementwise_permute_4D_fp16.cpp) +add_example_executable(example_elementwise_permute_4D_fp16_2d elementwise_permute_4D_fp16_2d.cpp) diff --git a/example/46_gemm_add_multiply/CMakeLists.txt b/example/46_gemm_add_multiply/CMakeLists.txt index cf7d81f895..bfe057e8da 100644 --- a/example/46_gemm_add_multiply/CMakeLists.txt +++ b/example/46_gemm_add_multiply/CMakeLists.txt @@ -1,6 +1,2 @@ -if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) - if(DL_KERNELS) - add_example_executable(example_gemm_add_multiply_dl_fp16 gemm_add_multiply_dl_fp16.cpp) - endif() - add_example_executable(example_gemm_add_multiply_xdl_fp16 gemm_add_multiply_xdl_fp16.cpp) -endif() +add_example_executable(example_gemm_add_multiply_dl_fp16 gemm_add_multiply_dl_fp16.cpp) +add_example_executable(example_gemm_add_multiply_xdl_fp16 gemm_add_multiply_xdl_fp16.cpp) diff --git a/example/48_pool3d_fwd/CMakeLists.txt 
b/example/48_pool3d_fwd/CMakeLists.txt index f821f2c97b..492cb4d37e 100644 --- a/example/48_pool3d_fwd/CMakeLists.txt +++ b/example/48_pool3d_fwd/CMakeLists.txt @@ -1,3 +1 @@ -if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) - add_example_executable(example_pool3d_fwd_fp16 pool3d_fwd_fp16.cpp) -endif() +add_example_executable(example_pool3d_fwd_fp16 pool3d_fwd_fp16.cpp) diff --git a/example/49_maxpool2d_bwd/CMakeLists.txt b/example/49_maxpool2d_bwd/CMakeLists.txt index fe98b7c997..b29cf9ccbc 100644 --- a/example/49_maxpool2d_bwd/CMakeLists.txt +++ b/example/49_maxpool2d_bwd/CMakeLists.txt @@ -1,9 +1,3 @@ -if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES) - add_example_executable(example_maxpool2d_bwd_bf16 maxpool2d_bwd_bf16.cpp) -endif() -if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) - add_example_executable(example_maxpool2d_bwd_fp16 maxpool2d_bwd_fp16.cpp) -endif() -if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES) - add_example_executable(example_maxpool2d_bwd_fp32 maxpool2d_bwd_fp32.cpp) -endif() +add_example_executable(example_maxpool2d_bwd_bf16 maxpool2d_bwd_bf16.cpp) +add_example_executable(example_maxpool2d_bwd_fp16 maxpool2d_bwd_fp16.cpp) +add_example_executable(example_maxpool2d_bwd_fp32 maxpool2d_bwd_fp32.cpp) diff --git a/example/50_put_element/CMakeLists.txt b/example/50_put_element/CMakeLists.txt index eca410008f..1b0020ebcf 100644 --- a/example/50_put_element/CMakeLists.txt +++ b/example/50_put_element/CMakeLists.txt @@ -1,3 +1 @@ -if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) - add_example_executable(example_put_element_fp16 put_element_fp16.cpp) -endif() +add_example_executable(example_put_element_fp16 put_element_fp16.cpp) diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt index 1fdd2f6d14..d030c53453 100644 --- a/example/CMakeLists.txt +++ b/example/CMakeLists.txt @@ -7,20 +7,114 @@ add_custom_target(examples) function(add_example_executable EXAMPLE_NAME FILE_NAME) message("adding example ${EXAMPLE_NAME}") - 
add_executable(${EXAMPLE_NAME} ${FILE_NAME}) - target_link_libraries(${EXAMPLE_NAME} PRIVATE utility) - add_test(NAME ${EXAMPLE_NAME} COMMAND $ ${ARGN}) - add_dependencies(examples ${EXAMPLE_NAME}) - add_dependencies(check ${EXAMPLE_NAME}) - rocm_install(TARGETS ${EXAMPLE_NAME} COMPONENT examples) + set(result 1) + if(DEFINED DTYPES) + foreach(source IN LISTS FILE_NAME) + set(test 0) + foreach(type IN LISTS DTYPES) + if(type MATCHES "fp16") + set(type1 "_f16") + elseif(type MATCHES "fp32") + set(type1 "_f32") + elseif(type MATCHES "fp8") + set(type1 "_f8") + elseif(type MATCHES "bf16") + set(type1 "_b16") + elseif(type MATCHES "fp64") + set(type1 "_f64") + elseif(type MATCHES "int8") + set(type1 "_i8") + endif() + if("${source}" MATCHES "${type}" OR "${source}" MATCHES "${type1}") + #if filename matches any selected type, exit type loop and do not exclude the file from the list + set(test 0) + break() + elseif((source MATCHES "fp8" OR source MATCHES "fp32" OR source MATCHES "fp64" OR source MATCHES "bf16" OR source MATCHES "int8" OR source MATCHES "fp16" OR + source MATCHES "_f8" OR source MATCHES "_f32" OR source MATCHES "_f64" OR source MATCHES "_i8" OR source MATCHES "_f16" OR source MATCHES "_b16") AND + NOT(source MATCHES type OR source MATCHES type1)) + #if filename contains a type which doesn't match any selected type, mark it for removal + set(test 1) + endif() + endforeach() + if(test EQUAL 1) + message("removing example source file ${source} ") + list(REMOVE_ITEM FILE_NAME "${source}") + endif() + endforeach() + endif() + foreach(source IN LISTS FILE_NAME) + if(NOT DEFINED DL_KERNELS AND source MATCHES "_dl") + message("removing dl example ${source} ") + list(REMOVE_ITEM FILE_NAME "${source}") + endif() + endforeach() + #only continue if there are some source files left on the list + if(FILE_NAME) + add_executable(${EXAMPLE_NAME} ${FILE_NAME}) + target_link_libraries(${EXAMPLE_NAME} PRIVATE utility) + add_test(NAME ${EXAMPLE_NAME} COMMAND $ ${ARGN}) + 
add_dependencies(examples ${EXAMPLE_NAME}) + add_dependencies(check ${EXAMPLE_NAME}) + rocm_install(TARGETS ${EXAMPLE_NAME} COMPONENT examples) + set(result 0) + endif() + #message("add_example returns ${result}") + return(PROPAGATE result) endfunction(add_example_executable EXAMPLE_NAME) function(add_example_executable_no_testing EXAMPLE_NAME FILE_NAME) message("adding example ${EXAMPLE_NAME}") - add_executable(${EXAMPLE_NAME} ${FILE_NAME}) - target_link_libraries(${EXAMPLE_NAME} PRIVATE utility) - add_dependencies(examples ${EXAMPLE_NAME}) - rocm_install(TARGETS ${EXAMPLE_NAME} COMPONENT examples) + set(result 1) + if(DEFINED DTYPES) + foreach(source IN LISTS FILE_NAME) + set(test 0) + foreach(type IN LISTS DTYPES) + if(type MATCHES "fp16") + set(type1 "_f16") + elseif(type MATCHES "fp32") + set(type1 "_f32") + elseif(type MATCHES "fp8") + set(type1 "_f8") + elseif(type MATCHES "bf16") + set(type1 "_b16") + elseif(type MATCHES "fp64") + set(type1 "_f64") + elseif(type MATCHES "int8") + set(type1 "_i8") + endif() + if("${source}" MATCHES "${type}" OR "${source}" MATCHES "${type1}") + #if filename matches any selected type, exit type loop and do not exclude the file from the list + set(test 0) + break() + elseif((source MATCHES "fp8" OR source MATCHES "fp32" OR source MATCHES "fp64" OR source MATCHES "bf16" OR source MATCHES "int8" OR source MATCHES "fp16" OR + source MATCHES "_f8" OR source MATCHES "_f32" OR source MATCHES "_f64" OR source MATCHES "_i8" OR source MATCHES "_f16" OR source MATCHES "_b16") AND + NOT(source MATCHES type OR source MATCHES type1)) + #if filename contains a type which doesn't match any selected type, mark it for removal + set(test 1) + endif() + endforeach() + if(test EQUAL 1) + message("removing example ${source} ") + list(REMOVE_ITEM FILE_NAME "${source}") + endif() + endforeach() + endif() + foreach(source IN LISTS FILE_NAME) + if(NOT DEFINED DL_KERNELS AND source MATCHES "_dl") + message("removing dl example ${source} ") + 
list(REMOVE_ITEM FILE_NAME "${source}") + endif() + endforeach() + #only continue if there are some source files left on the list + if(FILE_NAME) + add_executable(${EXAMPLE_NAME} ${FILE_NAME}) + target_link_libraries(${EXAMPLE_NAME} PRIVATE utility) + add_dependencies(examples ${EXAMPLE_NAME}) + rocm_install(TARGETS ${EXAMPLE_NAME} COMPONENT examples) + set(result 0) + endif() + #message("add_example returns ${result}") + return(PROPAGATE result) endfunction(add_example_executable_no_testing EXAMPLE_NAME) # add all example subdir diff --git a/library/include/ck/library/tensor_operation_instance/gpu/batchnorm_backward.hpp b/library/include/ck/library/tensor_operation_instance/gpu/batchnorm_backward.hpp index 0e1f6f04e8..efb76df256 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/batchnorm_backward.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/batchnorm_backward.hpp @@ -16,26 +16,26 @@ namespace tensor_operation { namespace device { namespace instance { -// FP16 +#ifdef CK_ENABLE_FP16 void add_device_batchnorm_backward_rank_4_3_f16_instances( std::vector>>&); - -// FP32 +#endif +#ifdef CK_ENABLE_FP32 void add_device_batchnorm_backward_rank_4_3_f32_instances( std::vector>>&); - -// BF16 +#endif +#ifdef CK_ENABLE_BF16 void add_device_batchnorm_backward_rank_4_3_bf16_instances( std::vector>>&); - -// FP64 +#endif +#ifdef CK_ENABLE_FP64 void add_device_batchnorm_backward_rank_4_3_f64_instances( std::vector>>&); - +#endif template > op_ptrs; - +#ifdef CK_ENABLE_FP16 if constexpr(is_same_v && is_same_v && is_same_v && is_same_v && is_same_v && is_same_v && @@ -83,37 +83,43 @@ struct DeviceOperationInstanceFactory< add_device_batchnorm_backward_rank_4_3_f16_instances(op_ptrs); } } - else if constexpr(is_same_v && is_same_v && - is_same_v && is_same_v && - is_same_v && is_same_v && - is_same_v) +#endif +#ifdef CK_ENABLE_FP32 + if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v && + is_same_v && is_same_v && + is_same_v) 
{ if constexpr(Rank == 4 && NumReduceDim == 3 && is_same_v) { add_device_batchnorm_backward_rank_4_3_f32_instances(op_ptrs); } } - else if constexpr(is_same_v && is_same_v && - is_same_v && is_same_v && - is_same_v && is_same_v && - is_same_v) +#endif +#ifdef CK_ENABLE_BF16 + if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v && + is_same_v && is_same_v && + is_same_v) { if constexpr(Rank == 4 && NumReduceDim == 3 && is_same_v) { add_device_batchnorm_backward_rank_4_3_bf16_instances(op_ptrs); } } - else if constexpr(is_same_v && is_same_v && - is_same_v && is_same_v && - is_same_v && is_same_v && - is_same_v) +#endif +#ifdef CK_ENABLE_FP64 + if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v && + is_same_v && is_same_v && + is_same_v) { if constexpr(Rank == 4 && NumReduceDim == 3 && is_same_v) { add_device_batchnorm_backward_rank_4_3_f64_instances(op_ptrs); } } - +#endif return op_ptrs; } }; diff --git a/library/include/ck/library/tensor_operation_instance/gpu/batchnorm_forward.hpp b/library/include/ck/library/tensor_operation_instance/gpu/batchnorm_forward.hpp index 8fd1c7665d..08e35d722c 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/batchnorm_forward.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/batchnorm_forward.hpp @@ -16,26 +16,26 @@ namespace tensor_operation { namespace device { namespace instance { -// FP16 +#ifdef CK_ENABLE_FP16 void add_device_batchnorm_forward_rank_4_3_f16_instances( std::vector< std::unique_ptr>>&); - -// FP32 +#endif +#ifdef CK_ENABLE_FP32 void add_device_batchnorm_forward_rank_4_3_f32_instances( std::vector< std::unique_ptr>>&); - -// BF16 +#endif +#ifdef CK_ENABLE_BF16 void add_device_batchnorm_forward_rank_4_3_bf16_instances( std::vector< std::unique_ptr>>&); - -// FP64 +#endif +#ifdef CK_ENABLE_FP64 void add_device_batchnorm_forward_rank_4_3_f64_instances( std::vector< std::unique_ptr>>&); - +#endif template > op_ptrs; - +#ifdef CK_ENABLE_FP16 if 
constexpr(is_same_v && is_same_v && is_same_v && is_same_v && is_same_v && is_same_v) @@ -79,34 +79,40 @@ struct DeviceOperationInstanceFactory< add_device_batchnorm_forward_rank_4_3_f16_instances(op_ptrs); } } - else if constexpr(is_same_v && is_same_v && - is_same_v && is_same_v && - is_same_v && is_same_v) +#endif +#ifdef CK_ENABLE_FP32 + if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v && + is_same_v && is_same_v) { if constexpr(Rank == 4 && NumReduceDim == 3 && is_same_v) { add_device_batchnorm_forward_rank_4_3_f32_instances(op_ptrs); } } - else if constexpr(is_same_v && is_same_v && - is_same_v && is_same_v && - is_same_v && is_same_v) +#endif +#ifdef CK_ENABLE_BF16 + if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v && + is_same_v && is_same_v) { if constexpr(Rank == 4 && NumReduceDim == 3 && is_same_v) { add_device_batchnorm_forward_rank_4_3_bf16_instances(op_ptrs); } } - else if constexpr(is_same_v && is_same_v && - is_same_v && is_same_v && - is_same_v && is_same_v) +#endif +#ifdef CK_ENABLE_FP64 + if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v && + is_same_v && is_same_v) { if constexpr(Rank == 4 && NumReduceDim == 3 && is_same_v) { add_device_batchnorm_forward_rank_4_3_f64_instances(op_ptrs); } } - +#endif return op_ptrs; } }; diff --git a/library/include/ck/library/tensor_operation_instance/gpu/batchnorm_infer.hpp b/library/include/ck/library/tensor_operation_instance/gpu/batchnorm_infer.hpp index f6f4df7e2e..97c216b4ad 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/batchnorm_infer.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/batchnorm_infer.hpp @@ -16,38 +16,38 @@ namespace tensor_operation { namespace device { namespace instance { -// FP16 +#ifdef CK_ENABLE_FP16 void add_device_batchnorm_infer_rank_4_f16_instances( std::vector, ck::Tuple, ck::tensor_operation::element_wise::NormalizeInInfer, 4>>>&); - -// FP32 +#endif +#ifdef CK_ENABLE_FP32 void 
add_device_batchnorm_infer_rank_4_f32_instances( std::vector, ck::Tuple, ck::tensor_operation::element_wise::NormalizeInInfer, 4>>>&); - -// BF16 +#endif +#ifdef CK_ENABLE_BF16 void add_device_batchnorm_infer_rank_4_bf16_instances( std::vector, ck::Tuple, ck::tensor_operation::element_wise::NormalizeInInfer, 4>>>&); - -// FP64 +#endif +#ifdef CK_ENABLE_FP64 void add_device_batchnorm_infer_rank_4_f64_instances( std::vector, ck::Tuple, ck::tensor_operation::element_wise::NormalizeInInfer, 4>>>&); - +#endif template > op_ptrs; - +#ifdef CK_ENABLE_FP16 if constexpr(is_same_v && is_same_v && is_same_v && is_same_v && is_same_v) @@ -79,34 +79,40 @@ struct DeviceOperationInstanceFactory && is_same_v && - is_same_v && is_same_v && - is_same_v) +#endif +#ifdef CK_ENABLE_FP32 + if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v && + is_same_v) { if constexpr(Rank == 4) { add_device_batchnorm_infer_rank_4_f32_instances(op_ptrs); } } - else if constexpr(is_same_v && is_same_v && - is_same_v && is_same_v && - is_same_v) +#endif +#ifdef CK_ENABLE_BF16 + if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v && + is_same_v) { if constexpr(Rank == 4) { add_device_batchnorm_infer_rank_4_bf16_instances(op_ptrs); } } - else if constexpr(is_same_v && is_same_v && - is_same_v && is_same_v && - is_same_v) +#endif +#ifdef CK_ENABLE_FP64 + if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v && + is_same_v) { if constexpr(Rank == 4) { add_device_batchnorm_infer_rank_4_f64_instances(op_ptrs); } } - +#endif return op_ptrs; } }; diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_splitk.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm_splitk.hpp index d239cd7706..4cba0875d9 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_splitk.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_splitk.hpp @@ -16,7 +16,7 @@ namespace ck { namespace tensor_operation { namespace device { 
namespace instance { - +#ifdef CK_ENABLE_FP16 void add_device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instances( std::vector>>& @@ -36,7 +36,8 @@ void add_device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instances( std::vector>>& instances); - +#endif +#ifdef CK_ENABLE_FP32 void add_device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instances( std::vector>>& @@ -56,8 +57,8 @@ void add_device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instances( std::vector>>& instances); - -#if defined CK_ENABLE_FP8 +#endif +#if(defined(CK_ENABLE_FP16) || defined(CK_ENABLE_FP8)) void add_device_gemm_xdl_splitk_f8_f16_f16_km_kn_mn_instances( std::vector>>& @@ -129,7 +130,7 @@ struct DeviceOperationInstanceFactory< static auto GetInstances() { std::vector> op_ptrs; - +#ifdef CK_ENABLE_FP32 if constexpr(is_same_v && is_same_v && is_same_v) { @@ -154,6 +155,8 @@ struct DeviceOperationInstanceFactory< add_device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instances(op_ptrs); } } +#endif +#ifdef CK_ENABLE_FP16 else if constexpr(is_same_v && is_same_v && is_same_v) { @@ -178,7 +181,8 @@ struct DeviceOperationInstanceFactory< add_device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instances(op_ptrs); } } -#if defined CK_ENABLE_FP8 +#endif +#if(defined(CK_ENABLE_FP16) || defined(CK_ENABLE_FP8)) else if constexpr(is_same_v && is_same_v && is_same_v) { @@ -228,7 +232,6 @@ struct DeviceOperationInstanceFactory< } } #endif - return op_ptrs; } }; diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data.hpp index 58c9064535..2b49e17187 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data.hpp @@ -16,6 +16,7 @@ namespace device { namespace instance { // conv2d backward data +#ifdef CK_ENABLE_FP16 void 
add_device_grouped_conv2d_bwd_data_xdl_gnhwk_gkyxc_gnhwc_f16_instances( std::vector>>& instances); - +#endif +#ifdef CK_ENABLE_FP32 void add_device_grouped_conv2d_bwd_data_xdl_gnhwk_gkyxc_gnhwc_f32_instances( std::vector>>& instances); - +#endif +#ifdef CK_ENABLE_BF16 void add_device_grouped_conv2d_bwd_data_xdl_gnhwk_gkyxc_gnhwc_bf16_instances( std::vector>>& instances); - +#endif +#ifdef CK_ENABLE_FP16 void add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f16_instances( std::vector>>& instances); - +#endif +#ifdef CK_ENABLE_FP32 void add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f32_instances( std::vector>>& instances); - +#endif +#ifdef CK_ENABLE_BF16 void add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_bf16_instances( std::vector>>& instances); - +#endif // conv3d backward data +#ifdef CK_ENABLE_FP16 void add_device_grouped_conv3d_bwd_data_xdl_gndhwk_gkzyxc_gndhwc_f16_instances( std::vector>>& instances); - +#endif +#ifdef CK_ENABLE_FP32 void add_device_grouped_conv3d_bwd_data_xdl_gndhwk_gkzyxc_gndhwc_f32_instances( std::vector>>& instances); - +#endif +#ifdef CK_ENABLE_BF16 void add_device_grouped_conv3d_bwd_data_xdl_gndhwk_gkzyxc_gndhwc_bf16_instances( std::vector>>& instances); - +#endif +#ifdef CK_ENABLE_FP16 void add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_f16_instances( std::vector>>& instances); - +#endif +#ifdef CK_ENABLE_FP32 void add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_f32_instances( std::vector>>& instances); - +#endif +#ifdef CK_ENABLE_BF16 void add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_bf16_instances( std::vector>>& instances); - +#endif template && is_same_v && is_same_v) { +#ifdef CK_ENABLE_FP16 if constexpr(is_same_v && is_same_v && is_same_v) { add_device_grouped_conv2d_bwd_data_xdl_gnhwk_gkyxc_gnhwc_f16_instances(op_ptrs); } +#endif +#ifdef CK_ENABLE_FP32 else if constexpr(is_same_v && is_same_v && is_same_v) { 
add_device_grouped_conv2d_bwd_data_xdl_gnhwk_gkyxc_gnhwc_f32_instances(op_ptrs); } +#endif +#ifdef CK_ENABLE_BF16 else if constexpr(is_same_v && is_same_v && is_same_v) { add_device_grouped_conv2d_bwd_data_xdl_gnhwk_gkyxc_gnhwc_bf16_instances( op_ptrs); } +#endif } else if constexpr(is_same_v && is_same_v && is_same_v) { +#ifdef CK_ENABLE_FP16 if constexpr(is_same_v && is_same_v && is_same_v) { add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f16_instances(op_ptrs); } +#endif +#ifdef CK_ENABLE_FP32 else if constexpr(is_same_v && is_same_v && is_same_v) { add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_f32_instances(op_ptrs); } +#endif +#ifdef CK_ENABLE_BF16 else if constexpr(is_same_v && is_same_v && is_same_v) { add_device_grouped_conv2d_bwd_data_xdl_nhwgk_gkyxc_nhwgc_bf16_instances( op_ptrs); } +#endif } } else if constexpr(NumDimSpatial == 3) @@ -274,46 +298,58 @@ struct DeviceOperationInstanceFactory< if constexpr(is_same_v && is_same_v && is_same_v) { +#ifdef CK_ENABLE_FP16 if constexpr(is_same_v && is_same_v && is_same_v) { add_device_grouped_conv3d_bwd_data_xdl_gndhwk_gkzyxc_gndhwc_f16_instances( op_ptrs); } +#endif +#ifdef CK_ENABLE_FP32 else if constexpr(is_same_v && is_same_v && is_same_v) { add_device_grouped_conv3d_bwd_data_xdl_gndhwk_gkzyxc_gndhwc_f32_instances( op_ptrs); } +#endif +#ifdef CK_ENABLE_BF16 else if constexpr(is_same_v && is_same_v && is_same_v) { add_device_grouped_conv3d_bwd_data_xdl_gndhwk_gkzyxc_gndhwc_bf16_instances( op_ptrs); } +#endif } else if constexpr(is_same_v && is_same_v && is_same_v) { +#ifdef CK_ENABLE_FP16 if constexpr(is_same_v && is_same_v && is_same_v) { add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_f16_instances( op_ptrs); } +#endif +#ifdef CK_ENABLE_FP32 else if constexpr(is_same_v && is_same_v && is_same_v) { add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_f32_instances( op_ptrs); } +#endif +#ifdef CK_ENABLE_BF16 else if constexpr(is_same_v && is_same_v && is_same_v) { 
add_device_grouped_conv3d_bwd_data_xdl_ndhwgk_gkzyxc_ndhwgc_bf16_instances( op_ptrs); } +#endif } } diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp index c7a694b4cc..c0a5b4bfe5 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp @@ -19,6 +19,7 @@ namespace instance { // xdl // conv1d backward weight +#ifdef CK_ENABLE_BF16 void add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_bf16_f32_bf16_instances( std::vector>>& instances); - +#endif +#ifdef CK_ENABLE_FP16 void add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f16_instances( std::vector>>& instances); - +#endif +#ifdef CK_ENABLE_FP32 void add_device_grouped_conv1d_bwd_weight_xdl_gnwc_gkxc_gnwk_f32_instances( std::vector>>& instances); - +#endif // conv2d backward weight +#ifdef CK_ENABLE_BF16 void add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_f32_bf16_instances( std::vector>>& instances); - +#endif +#ifdef CK_ENABLE_FP16 void add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_instances( std::vector>>& instances); - +#endif +#ifdef CK_ENABLE_FP32 void add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_instances( std::vector>>& instances); - +#endif +#ifdef CK_ENABLE_BF16 void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_instances( std::vector>>& instances); - +#endif +#ifdef CK_ENABLE_FP16 void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_instances( std::vector>>& instances); - +#endif +#ifdef CK_ENABLE_FP32 void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_instances( std::vector>>& instances); - +#endif // conv3d backward weight +#ifdef CK_ENABLE_BF16 void 
add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16_instances( std::vector>>& instances); - +#endif +#ifdef CK_ENABLE_FP16 void add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f16_instances( std::vector>>& instances); - +#endif +#ifdef CK_ENABLE_FP32 void add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f32_instances( std::vector>>& instances); - +#endif +#ifdef CK_ENABLE_BF16 void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_instances( std::vector>>& instances); - +#endif +#ifdef CK_ENABLE_FP16 void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instances( std::vector>>& instances); - +#endif +#ifdef CK_ENABLE_FP32 void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instances( std::vector>>& instances); +#endif #ifdef DL_KERNELS // dl // conv1d backward weight +#ifdef CK_ENABLE_BF16 void add_device_grouped_conv1d_bwd_weight_dl_gnwc_gkxc_gnwk_bf16_f32_bf16_instances( std::vector>>& instances); - +#endif +#ifdef CK_ENABLE_FP16 void add_device_grouped_conv1d_bwd_weight_dl_gnwc_gkxc_gnwk_f16_instances( std::vector>>& instances); - +#endif +#ifdef CK_ENABLE_FP32 void add_device_grouped_conv1d_bwd_weight_dl_gnwc_gkxc_gnwk_f32_instances( std::vector>>& instances); - +#endif +#ifdef CK_ENABLE_BF16 void add_device_grouped_conv1d_bwd_weight_dl_nwgc_gkxc_nwgk_bf16_f32_bf16_instances( std::vector>>& instances); - +#endif +#ifdef CK_ENABLE_FP16 void add_device_grouped_conv1d_bwd_weight_dl_nwgc_gkxc_nwgk_f16_instances( std::vector>>& instances); - +#endif +#ifdef CK_ENABLE_FP32 void add_device_grouped_conv1d_bwd_weight_dl_nwgc_gkxc_nwgk_f32_instances( std::vector>>& instances); - +#endif // conv2d backward weight +#ifdef CK_ENABLE_BF16 void add_device_grouped_conv2d_bwd_weight_dl_gnhwc_gkyxc_gnhwk_bf16_f32_bf16_instances( std::vector>>& instances); - +#endif +#ifdef CK_ENABLE_FP16 void add_device_grouped_conv2d_bwd_weight_dl_gnhwc_gkyxc_gnhwk_f16_instances( 
std::vector>>& instances); - +#endif +#ifdef CK_ENABLE_FP32 void add_device_grouped_conv2d_bwd_weight_dl_gnhwc_gkyxc_gnhwk_f32_instances( std::vector>>& instances); - +#endif +#ifdef CK_ENABLE_BF16 void add_device_grouped_conv2d_bwd_weight_dl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_instances( std::vector>>& instances); - +#endif +#ifdef CK_ENABLE_FP16 void add_device_grouped_conv2d_bwd_weight_dl_nhwgc_gkyxc_nhwgk_f16_instances( std::vector>>& instances); - +#endif +#ifdef CK_ENABLE_FP32 void add_device_grouped_conv2d_bwd_weight_dl_nhwgc_gkyxc_nhwgk_f32_instances( std::vector>>& instances); - +#endif // conv3d backward weight +#ifdef CK_ENABLE_BF16 void add_device_grouped_conv3d_bwd_weight_dl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16_instances( std::vector>>& instances); - +#endif +#ifdef CK_ENABLE_FP16 void add_device_grouped_conv3d_bwd_weight_dl_gndhwc_gkzyxc_gndhwk_f16_instances( std::vector>>& instances); - +#endif +#ifdef CK_ENABLE_FP32 void add_device_grouped_conv3d_bwd_weight_dl_gndhwc_gkzyxc_gndhwk_f32_instances( std::vector>>& instances); - +#endif +#ifdef CK_ENABLE_BF16 void add_device_grouped_conv3d_bwd_weight_dl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_instances( std::vector>>& instances); - +#endif +#ifdef CK_ENABLE_FP16 void add_device_grouped_conv3d_bwd_weight_dl_ndhwgc_gkzyxc_ndhwgk_f16_instances( std::vector>>& instances); - +#endif +#ifdef CK_ENABLE_FP32 void add_device_grouped_conv3d_bwd_weight_dl_ndhwgc_gkzyxc_ndhwgk_f32_instances( std::vector>>& instances); #endif +#endif template && is_same_v && is_same_v) { +#ifdef CK_ENABLE_FP32 if constexpr(is_same_v && is_same_v && is_same_v) { @@ -470,6 +506,8 @@ struct DeviceOperationInstanceFactory && is_same_v && is_same_v) { @@ -478,6 +516,8 @@ struct DeviceOperationInstanceFactory && is_same_v && is_same_v) @@ -489,21 +529,27 @@ struct DeviceOperationInstanceFactory && is_same_v && is_same_v) { #ifdef DL_KERNELS +#ifdef CK_ENABLE_FP32 if constexpr(is_same_v && is_same_v && is_same_v) { 
add_device_grouped_conv1d_bwd_weight_dl_nwgc_gkxc_nwgk_f32_instances(op_ptrs); } +#endif +#ifdef CK_ENABLE_FP16 else if constexpr(is_same_v && is_same_v && is_same_v) { add_device_grouped_conv1d_bwd_weight_dl_nwgc_gkxc_nwgk_f16_instances(op_ptrs); } +#endif +#ifdef CK_ENABLE_BF16 else if constexpr(is_same_v && is_same_v && is_same_v) @@ -511,6 +557,7 @@ struct DeviceOperationInstanceFactory && is_same_v && is_same_v) { +#ifdef CK_ENABLE_FP32 if constexpr(is_same_v && is_same_v && is_same_v) { @@ -529,6 +577,8 @@ struct DeviceOperationInstanceFactory && is_same_v && is_same_v) { @@ -539,6 +589,8 @@ struct DeviceOperationInstanceFactory && is_same_v && is_same_v) @@ -550,10 +602,12 @@ struct DeviceOperationInstanceFactory && is_same_v && is_same_v) { +#ifdef CK_ENABLE_FP32 if constexpr(is_same_v && is_same_v && is_same_v) { @@ -564,6 +618,8 @@ struct DeviceOperationInstanceFactory && is_same_v && is_same_v) { @@ -574,6 +630,8 @@ struct DeviceOperationInstanceFactory && is_same_v && is_same_v) @@ -585,6 +643,7 @@ struct DeviceOperationInstanceFactory && is_same_v && is_same_v) { +#ifdef CK_ENABLE_FP32 if constexpr(is_same_v && is_same_v && is_same_v) { @@ -602,6 +662,8 @@ struct DeviceOperationInstanceFactory && is_same_v && is_same_v) { @@ -612,6 +674,8 @@ struct DeviceOperationInstanceFactory && is_same_v && is_same_v) @@ -623,10 +687,12 @@ struct DeviceOperationInstanceFactory && is_same_v && is_same_v) { +#ifdef CK_ENABLE_FP32 if constexpr(is_same_v && is_same_v && is_same_v) { @@ -637,6 +703,8 @@ struct DeviceOperationInstanceFactory && is_same_v && is_same_v) { @@ -647,6 +715,8 @@ struct DeviceOperationInstanceFactory && is_same_v && is_same_v) @@ -658,6 +728,7 @@ struct DeviceOperationInstanceFactory>>& instances); - +#endif +#ifdef CK_ENABLE_FP16 void add_device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_f16_instances( std::vector>>& instances); - +#endif +#ifdef CK_ENABLE_FP32 void add_device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_f32_instances( std::vector>>& 
instances); - +#endif +#ifdef CK_ENABLE_INT8 void add_device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_int8_instances( std::vector>>& instances); - +#endif +#ifdef CK_ENABLE_BF16 // grouped conv2d forward, GNHWC/GKYXC/GNHWK void add_device_grouped_conv1d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instances( std::vector>>& instances); - +#endif +#ifdef CK_ENABLE_FP16 void add_device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instances( std::vector>>& instances); - +#endif +#ifdef CK_ENABLE_FP32 void add_device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instances( std::vector>>& instances); - +#endif +#ifdef DL_KERNELS +#ifdef CK_ENABLE_FP16 void add_device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_instances( std::vector>>& instances); - +#endif +#ifdef CK_ENABLE_FP32 void add_device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f32_instances( std::vector>>& instances); - +#endif +#endif +#ifdef CK_ENABLE_FP16 void add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_instances( std::vector>>& instances); + +#ifdef DL_KERNELS void add_device_grouped_conv2d_fwd_dl_nhwgc_gkyxc_nhwgk_f16_instances( std::vector>>& instances); - +#endif +#endif +#ifdef CK_ENABLE_INT8 void add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_instances( std::vector>>& instances); +#endif +#if(defined(CK_ENABLE_FP32) && defined(DL_KERNELS)) void add_device_grouped_conv2d_fwd_dl_nhwgc_gkyxc_nhwgk_f32_instances( std::vector>>& instances); +#endif // grouped conv2d forward, NHWGC/GKYXC/NHWGK +#ifdef CK_ENABLE_BF16 void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_instances( std::vector>>& instances); - +#endif +#ifdef CK_ENABLE_FP16 void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instances( std::vector>>& instances); - +#endif +#ifdef CK_ENABLE_FP32 void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instances( std::vector>>& instances); - +#endif +#ifdef CK_ENABLE_BF16 // grouped conv3d forward, GNDHWC/GKZYXC/GNDHWK void 
add_device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_bf16_instances( std::vector>>& instances); - +#endif +#ifdef CK_ENABLE_FP16 void add_device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f16_instances( std::vector>>& instances); - +#endif +#ifdef CK_ENABLE_FP32 void add_device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f32_instances( std::vector>>& instances); - +#endif +#ifdef CK_ENABLE_INT8 void add_device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_int8_instances( std::vector>>& instances); - +#endif +#ifdef CK_ENABLE_BF16 // grouped conv3d forward, NDHWGC/GKZYXC/NDHWGK void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instances( std::vector>>& instances); - +#endif +#ifdef CK_ENABLE_FP16 void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instances( std::vector>>& instances); - +#endif +#ifdef CK_ENABLE_FP32 void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instances( std::vector>>& instances); - +#endif +#ifdef CK_ENABLE_INT8 void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_int8_instances( std::vector>>& instances); +#endif template && is_same_v && is_same_v) { +#ifdef CK_ENABLE_FP32 if constexpr(is_same_v && is_same_v && is_same_v) { add_device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_f32_instances(op_ptrs); } - else if constexpr(is_same_v && is_same_v && - is_same_v) +#endif +#ifdef CK_ENABLE_FP16 + if constexpr(is_same_v && is_same_v && + is_same_v) { add_device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_f16_instances(op_ptrs); } - else if constexpr(is_same_v && - is_same_v && - is_same_v) +#endif +#ifdef CK_ENABLE_BF16 + if constexpr(is_same_v && + is_same_v && is_same_v) { add_device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_bf16_instances(op_ptrs); } - else if constexpr(is_same_v && is_same_v && - is_same_v) +#endif +#ifdef CK_ENABLE_INT8 + if constexpr(is_same_v && is_same_v && + is_same_v) { add_device_grouped_conv1d_fwd_xdl_gnwc_gkxc_gnwk_int8_instances(op_ptrs); } +#endif } else if constexpr(NumDimSpatial 
== 2 && is_same_v && is_same_v && is_same_v) { +#ifdef CK_ENABLE_FP32 if constexpr(is_same_v && is_same_v && is_same_v) { add_device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f32_instances(op_ptrs); +#ifdef DL_KERNELS add_device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f32_instances(op_ptrs); +#endif } - else if constexpr(is_same_v && is_same_v && - is_same_v) +#endif +#ifdef CK_ENABLE_FP16 + if constexpr(is_same_v && is_same_v && + is_same_v) { add_device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_f16_instances(op_ptrs); +#ifdef DL_KERNELS add_device_grouped_conv2d_fwd_dl_gnhwc_gkyxc_gnhwk_f16_instances(op_ptrs); +#endif add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_f16_instances(op_ptrs); } - else if constexpr(is_same_v && - is_same_v && - is_same_v) +#endif +#ifdef CK_ENABLE_BF16 + if constexpr(is_same_v && + is_same_v && is_same_v) { add_device_grouped_conv1d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instances(op_ptrs); } +#endif +#ifdef CK_ENABLE_INT8 else if constexpr(is_same_v && is_same_v && is_same_v) { add_device_grouped_conv2d_fwd_wmma_gnhwc_gkyxc_gnhwk_i8_instances(op_ptrs); } +#endif } else if constexpr(NumDimSpatial == 2 && is_same_v && is_same_v && is_same_v) { +#ifdef CK_ENABLE_FP32 if constexpr(is_same_v && is_same_v && is_same_v) { add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instances(op_ptrs); +#ifdef DL_KERNELS add_device_grouped_conv2d_fwd_dl_nhwgc_gkyxc_nhwgk_f32_instances(op_ptrs); +#endif } - else if constexpr(is_same_v && is_same_v && - is_same_v) +#endif +#ifdef CK_ENABLE_FP16 + if constexpr(is_same_v && is_same_v && + is_same_v) { add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_instances(op_ptrs); +#ifdef DL_KERNELS add_device_grouped_conv2d_fwd_dl_nhwgc_gkyxc_nhwgk_f16_instances(op_ptrs); +#endif } - else if constexpr(is_same_v && - is_same_v && - is_same_v) +#endif +#ifdef CK_ENABLE_BDF16 + if constexpr(is_same_v && + is_same_v && is_same_v) { add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_instances(op_ptrs); } 
+#endif } else if constexpr(NumDimSpatial == 3 && is_same_v && is_same_v && is_same_v) { +#ifdef CK_ENABLE_FP32 if constexpr(is_same_v && is_same_v && is_same_v) { add_device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f32_instances(op_ptrs); } - else if constexpr(is_same_v && is_same_v && - is_same_v) +#endif +#ifdef CK_ENABLE_FP16 + if constexpr(is_same_v && is_same_v && + is_same_v) { add_device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_f16_instances(op_ptrs); } - else if constexpr(is_same_v && - is_same_v && - is_same_v) +#endif +#ifdef CK_ENABLE_BF16 + if constexpr(is_same_v && + is_same_v && is_same_v) { add_device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_bf16_instances(op_ptrs); } - else if constexpr(is_same_v && is_same_v && - is_same_v) +#endif +#ifdef CK_ENABLE_INT8 + if constexpr(is_same_v && is_same_v && + is_same_v) { add_device_grouped_conv3d_fwd_xdl_gndhwc_gkzyxc_gndhwk_int8_instances(op_ptrs); } +#endif } else if constexpr(NumDimSpatial == 3 && is_same_v && is_same_v && is_same_v) { +#ifdef CK_ENABLE_FP32 if constexpr(is_same_v && is_same_v && is_same_v) { add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instances(op_ptrs); } - else if constexpr(is_same_v && is_same_v && - is_same_v) +#endif +#ifdef CK_ENABLE_FP16 + if constexpr(is_same_v && is_same_v && + is_same_v) { add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instances(op_ptrs); } - else if constexpr(is_same_v && - is_same_v && - is_same_v) +#endif +#ifdef CK_ENABLE_BF16 + if constexpr(is_same_v && + is_same_v && is_same_v) { add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instances(op_ptrs); } - else if constexpr(is_same_v && is_same_v && - is_same_v) +#endif +#ifdef CK_ENABLE_INT8 + if constexpr(is_same_v && is_same_v && + is_same_v) { add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_int8_instances(op_ptrs); } +#endif } return op_ptrs; diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance.hpp 
b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance.hpp index 9930b1a6fa..4c22b047f3 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance.hpp @@ -2,13 +2,20 @@ // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. #pragma once - #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_min.hpp" #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_max.hpp" #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_amax.hpp" #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_add.hpp" #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_avg.hpp" #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_norm2.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_add.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_avg.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_min.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_max.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_amax.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_add.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_avg.hpp" +#include 
"ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_norm2.hpp" #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_add.hpp" #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_avg.hpp" #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_norm2.hpp" @@ -18,39 +25,10 @@ #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_add.hpp" #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_avg.hpp" #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_norm2.hpp" -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_add.hpp" -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_avg.hpp" -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_norm2.hpp" -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_min.hpp" -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_max.hpp" -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_amax.hpp" -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_min.hpp" -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_max.hpp" -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_amax.hpp" -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_add.hpp" -#include 
"ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_avg.hpp" -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_add.hpp" -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_avg.hpp" -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_norm2.hpp" -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_min.hpp" -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_max.hpp" -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_amax.hpp" -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_add.hpp" -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_avg.hpp" #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_add.hpp" #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_avg.hpp" #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_add.hpp" #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_avg.hpp" -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_add.hpp" -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_avg.hpp" -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_add.hpp" -#include 
"ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_avg.hpp" -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_min.hpp" -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_max.hpp" -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_amax.hpp" -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_add.hpp" -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_avg.hpp" -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_norm2.hpp" #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_add.hpp" #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_avg.hpp" #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_norm2.hpp" @@ -60,17 +38,38 @@ #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_add.hpp" #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_avg.hpp" #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_norm2.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_add.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_avg.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_norm2.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_min.hpp" +#include 
"ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_max.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_amax.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_add.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_avg.hpp" #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_add.hpp" #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_avg.hpp" #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_norm2.hpp" #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_min.hpp" #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_max.hpp" #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_amax.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_min.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_max.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_amax.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_add.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_avg.hpp" #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_min.hpp" #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_max.hpp" #include 
"ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_amax.hpp" #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_add.hpp" #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_avg.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_add.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_avg.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_norm2.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_min.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_max.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_amax.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_add.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_avg.hpp" #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_add.hpp" #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_avg.hpp" #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_norm2.hpp" diff --git a/library/src/tensor_operation_instance/gpu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/CMakeLists.txt index 1d54a141b7..a0478c9f09 100644 --- a/library/src/tensor_operation_instance/gpu/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/CMakeLists.txt @@ -1,9 +1,57 @@ function(add_instance_library INSTANCE_NAME) message("adding instance ${INSTANCE_NAME}") - 
add_library(${INSTANCE_NAME} OBJECT ${ARGN}) - target_compile_features(${INSTANCE_NAME} PUBLIC) - set_target_properties(${INSTANCE_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON) - clang_tidy_check(${INSTANCE_NAME}) + set(result 1) + if(DEFINED DTYPES) + foreach(source IN LISTS ARGN) + set(test 0) + foreach(type IN LISTS DTYPES) + if(type MATCHES "fp16") + set(type1 "_f16") + elseif(type MATCHES "fp32") + set(type1 "_f32") + elseif(type MATCHES "fp8") + set(type1 "_f8") + elseif(type MATCHES "bf16") + set(type1 "_b16") + elseif(type MATCHES "fp64") + set(type1 "_f64") + elseif(type MATCHES "int8") + set(type1 "_i8") + endif() + #make an exception for reduction kernels + if("${source}" MATCHES "${type}" OR "${source}" MATCHES "${type1}" OR "${source}" MATCHES "device_reduce_instance") + #if filename matches any selected type, exit type loop and do no exclude the file from the list + set(test 0) + break() + elseif((source MATCHES "fp8" OR source MATCHES "fp32" OR source MATCHES "fp64" OR source MATCHES "bf16" OR source MATCHES "int8" OR source MATCHES "fp16" OR + source MATCHES "_f8" OR source MATCHES "_f32" OR source MATCHES "_f64" OR source MATCHES "_i8" OR source MATCHES "_f16" OR source MATCHES "_b16") AND + NOT(source MATCHES type OR source MATCHES type1)) + #if filename contains a type which doesn't match any selected type, mark it for removal + set(test 1) + endif() + endforeach() + if(test EQUAL 1) + message("removing instance ${source} ") + list(REMOVE_ITEM ARGN "${source}") + endif() + endforeach() + endif() + foreach(source IN LISTS ARGN) + if(NOT DEFINED DL_KERNELS AND source MATCHES "_dl") + message("removing dl instance ${source} ") + list(REMOVE_ITEM ARGN "${source}") + endif() + endforeach() + #only continue if there are some source files left on the list + if(ARGN) + add_library(${INSTANCE_NAME} OBJECT ${ARGN}) + target_compile_features(${INSTANCE_NAME} PUBLIC) + set_target_properties(${INSTANCE_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON) + 
clang_tidy_check(${INSTANCE_NAME}) + set(result 0) + endif() + #message("add_instance_library returns ${result}") + return(PROPAGATE result) endfunction(add_instance_library INSTANCE_NAME) @@ -15,33 +63,49 @@ IF(IS_DIRECTORY "${subdir_path}") set(cmake_instance) file(READ "${subdir_path}/CMakeLists.txt" cmake_instance) set(add_inst 0) - if("${cmake_instance}" MATCHES "DTYPES MATCHES \"fp8\" " AND DTYPES MATCHES "fp8") - #message("fp8 instance found!") + if(("${cmake_instance}" MATCHES "_fp8" OR "${cmake_instance}" MATCHES "_f8") AND DTYPES MATCHES "fp8") + message("fp8 instance found!") set(add_inst 1) endif() - if("${cmake_instance}" MATCHES "DTYPES MATCHES \"fp16\"" AND DTYPES MATCHES "fp16") - #message("fp16 instance found!") + if(("${cmake_instance}" MATCHES "_fp16" OR "${cmake_instance}" MATCHES "_f16") AND DTYPES MATCHES "fp16") + message("fp16 instance found!") set(add_inst 1) endif() - if("${cmake_instance}" MATCHES "DTYPES MATCHES \"fp32\"" AND DTYPES MATCHES "fp32") - #message("fp32 instance found!") + if(("${cmake_instance}" MATCHES "_fp32" OR "${cmake_instance}" MATCHES "_f32") AND DTYPES MATCHES "fp32") + message("fp32 instance found!") set(add_inst 1) endif() - if("${cmake_instance}" MATCHES "DTYPES MATCHES \"fp64\"" AND DTYPES MATCHES "fp64") - #message("fp64 instance found!") + if(("${cmake_instance}" MATCHES "_fp64" OR "${cmake_instance}" MATCHES "_f64") AND DTYPES MATCHES "fp64") + message("fp64 instance found!") set(add_inst 1) endif() - if("${cmake_instance}" MATCHES "DTYPES MATCHES \"bf16\"" AND DTYPES MATCHES "bf16") - #message("bf16 instance found!") + if("${cmake_instance}" MATCHES "_bf16" AND DTYPES MATCHES "bf16") + message("bf16 instance found!") set(add_inst 1) endif() - if("${cmake_instance}" MATCHES "DTYPES MATCHES \"int8\"" AND DTYPES MATCHES "int8") - #message("int8 instance found!") + if(("${cmake_instance}" MATCHES "_int8" OR "${cmake_instance}" MATCHES "_i8") AND DTYPES MATCHES "int8") + message("int8 instance found!") 
set(add_inst 1) endif() - if(NOT "${cmake_instance}" MATCHES "DTYPES" OR NOT DEFINED DTYPES) - #message("instance should be built for all types!") - set(add_inst 1) + if(NOT "${cmake_instance}" MATCHES "_fp8" OR + NOT "${cmake_instance}" MATCHES "_f8" OR + NOT "${cmake_instance}" MATCHES "_fp16" OR + NOT "${cmake_instance}" MATCHES "_f16" OR + NOT "${cmake_instance}" MATCHES "_fp32" OR + NOT "${cmake_instance}" MATCHES "_f32" OR + NOT "${cmake_instance}" MATCHES "_fp64" OR + NOT "${cmake_instance}" MATCHES "_f64" OR + NOT "${cmake_instance}" MATCHES "_bf16" OR + NOT "${cmake_instance}" MATCHES "_int8" OR + NOT "${cmake_instance}" MATCHES "_i8" OR + NOT "${cmake_instance}" MATCHES "_int4" OR + NOT DEFINED DTYPES) + message("instance should be built for all types!") + set(add_inst 1) + endif() + if("${cmake_instance}" MATCHES "quantization" AND DEFINED DTYPES AND NOT DTYPES MATCHES "int8") + message("quantization instances will not be built!") + set(add_inst 0) endif() if("${cmake_instance}" MATCHES "ONLY DL_KERNELS" AND NOT DEFINED DL_KERNELS) message("Found only dl instances, but DL_KERNELS is not set. 
Skipping.") diff --git a/library/src/tensor_operation_instance/gpu/avg_pool3d_bwd/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/avg_pool3d_bwd/CMakeLists.txt index ec079e3ba3..084714b707 100644 --- a/library/src/tensor_operation_instance/gpu/avg_pool3d_bwd/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/avg_pool3d_bwd/CMakeLists.txt @@ -1,11 +1,5 @@ set(DEVICE_AVGPOOL_BWD_INSTANCES) -if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) - list(APPEND DEVICE_AVGPOOL_BWD_INSTANCES device_avg_pool3d_bwd_ndhwc_f16_instance.cpp) -endif() -if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES) - list(APPEND DEVICE_AVGPOOL_BWD_INSTANCES device_avg_pool3d_bwd_ndhwc_bf16_instance.cpp) -endif() -if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES) - list(APPEND DEVICE_AVGPOOL_BWD_INSTANCES device_avg_pool3d_bwd_ndhwc_f32_instance.cpp) -endif() +list(APPEND DEVICE_AVGPOOL_BWD_INSTANCES device_avg_pool3d_bwd_ndhwc_f16_instance.cpp + device_avg_pool3d_bwd_ndhwc_bf16_instance.cpp + device_avg_pool3d_bwd_ndhwc_f32_instance.cpp) add_instance_library(device_avg_pool3d_bwd_instance ${DEVICE_AVGPOOL_BWD_INSTANCES}) diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/batched_gemm/CMakeLists.txt index 5b342595d8..69b6ddc754 100644 --- a/library/src/tensor_operation_instance/gpu/batched_gemm/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/batched_gemm/CMakeLists.txt @@ -1,26 +1,18 @@ set(BATCHED_GEMM_INSTANCES) -if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) - list(APPEND BATCHED_GEMM_INSTANCES device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instance.cpp +list(APPEND BATCHED_GEMM_INSTANCES device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instance.cpp device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instance.cpp device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instance.cpp - device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instance.cpp) -endif() -if(DTYPES MATCHES "bf16" OR NOT DEFINED 
DTYPES) - list(APPEND BATCHED_GEMM_INSTANCES device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gkn_gmn_instance.cpp + device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instance.cpp + device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gkn_gmn_instance.cpp device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gnk_gmn_instance.cpp device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gkn_gmn_instance.cpp - device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gnk_gmn_instance.cpp) -endif() -if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES) - list(APPEND BATCHED_GEMM_INSTANCES device_batched_gemm_xdl_f32_f32_f32_gmk_gkn_gmn_instance.cpp + device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gnk_gmn_instance.cpp + device_batched_gemm_xdl_f32_f32_f32_gmk_gkn_gmn_instance.cpp device_batched_gemm_xdl_f32_f32_f32_gmk_gnk_gmn_instance.cpp device_batched_gemm_xdl_f32_f32_f32_gkm_gkn_gmn_instance.cpp - device_batched_gemm_xdl_f32_f32_f32_gkm_gnk_gmn_instance.cpp) -endif() -if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES) - list(APPEND BATCHED_GEMM_INSTANCES device_batched_gemm_xdl_int8_int8_int8_gmk_gkn_gmn_instance.cpp + device_batched_gemm_xdl_f32_f32_f32_gkm_gnk_gmn_instance.cpp + device_batched_gemm_xdl_int8_int8_int8_gmk_gkn_gmn_instance.cpp device_batched_gemm_xdl_int8_int8_int8_gmk_gnk_gmn_instance.cpp device_batched_gemm_xdl_int8_int8_int8_gkm_gkn_gmn_instance.cpp device_batched_gemm_xdl_int8_int8_int8_gkm_gnk_gmn_instance.cpp) -endif() add_instance_library(device_batched_gemm_instance ${BATCHED_GEMM_INSTANCES}) diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add/CMakeLists.txt index 6710035ec6..d0e9b265af 100644 --- a/library/src/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/batched_gemm_add_relu_gemm_add/CMakeLists.txt @@ -1,6 +1,4 @@ -if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) 
add_instance_library(device_batched_gemm_add_relu_gemm_add_instance device_batched_gemm_add_relu_gemm_add_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp device_batched_gemm_add_relu_gemm_add_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance.cpp ) -endif() \ No newline at end of file diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_bias_permute/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/batched_gemm_bias_permute/CMakeLists.txt index b0f37e68fd..cd9c95c066 100644 --- a/library/src/tensor_operation_instance/gpu/batched_gemm_bias_permute/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/batched_gemm_bias_permute/CMakeLists.txt @@ -1,5 +1,4 @@ -if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) add_instance_library(device_batched_gemm_bias_permute_instance device_batched_gemm_bias_permute_m2_n3_k1_xdl_c_shuffle_f16_f16_f16_f16_instance.cpp ) -endif() + diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_gemm/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/batched_gemm_gemm/CMakeLists.txt index cdb1a53901..865a31e79a 100644 --- a/library/src/tensor_operation_instance/gpu/batched_gemm_gemm/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/batched_gemm_gemm/CMakeLists.txt @@ -1,6 +1,4 @@ -if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) add_instance_library(device_batched_gemm_gemm_instance device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp device_batched_gemm_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gon_gmo_instance.cpp ) -endif() diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/CMakeLists.txt index 444c93b118..b874bc50ee 100644 --- a/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/batched_gemm_multi_d/CMakeLists.txt @@ -1,25 +1,21 @@ # ONLY DL_KERNELS 
-if(DL_KERNELS) - set(BATCHED_GEMM_MULTID_INSTANCES) - if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) - list(APPEND BATCHED_GEMM_MULTID_INSTANCES device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gkn_gmn_instance.cpp) - list(APPEND BATCHED_GEMM_MULTID_INSTANCES device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gnk_gmn_instance.cpp) - list(APPEND BATCHED_GEMM_MULTID_INSTANCES device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gkn_gmn_instance.cpp) - list(APPEND BATCHED_GEMM_MULTID_INSTANCES device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gnk_gmn_instance.cpp) - list(APPEND BATCHED_GEMM_MULTID_INSTANCES device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gkn_gmn_irregular_instance.cpp) - list(APPEND BATCHED_GEMM_MULTID_INSTANCES device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gnk_gmn_irregular_instance.cpp) - list(APPEND BATCHED_GEMM_MULTID_INSTANCES device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gkn_gmn_irregular_instance.cpp) - list(APPEND BATCHED_GEMM_MULTID_INSTANCES device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gnk_gmn_irregular_instance.cpp) - endif() - if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES) - list(APPEND BATCHED_GEMM_MULTID_INSTANCES device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_instance.cpp) - list(APPEND BATCHED_GEMM_MULTID_INSTANCES device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_instance.cpp) - list(APPEND BATCHED_GEMM_MULTID_INSTANCES device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gkn_gmn_instance.cpp) - list(APPEND BATCHED_GEMM_MULTID_INSTANCES device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gnk_gmn_instance.cpp) - list(APPEND BATCHED_GEMM_MULTID_INSTANCES device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_irregular_instance.cpp) - list(APPEND BATCHED_GEMM_MULTID_INSTANCES device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_irregular_instance.cpp) - list(APPEND BATCHED_GEMM_MULTID_INSTANCES device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gkn_gmn_irregular_instance.cpp) - list(APPEND BATCHED_GEMM_MULTID_INSTANCES 
device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gnk_gmn_irregular_instance.cpp) - endif() - add_instance_library(device_batched_gemm_multi_d_instance ${BATCHED_GEMM_MULTID_INSTANCES}) -endif() +set(BATCHED_GEMM_MULTID_INSTANCES) +list(APPEND BATCHED_GEMM_MULTID_INSTANCES device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gkn_gmn_instance.cpp) +list(APPEND BATCHED_GEMM_MULTID_INSTANCES device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gnk_gmn_instance.cpp) +list(APPEND BATCHED_GEMM_MULTID_INSTANCES device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gkn_gmn_instance.cpp) +list(APPEND BATCHED_GEMM_MULTID_INSTANCES device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gnk_gmn_instance.cpp) +list(APPEND BATCHED_GEMM_MULTID_INSTANCES device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gkn_gmn_irregular_instance.cpp) +list(APPEND BATCHED_GEMM_MULTID_INSTANCES device_batched_gemm_multi_d_dl_f16_f16_f16_gmk_gnk_gmn_irregular_instance.cpp) +list(APPEND BATCHED_GEMM_MULTID_INSTANCES device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gkn_gmn_irregular_instance.cpp) +list(APPEND BATCHED_GEMM_MULTID_INSTANCES device_batched_gemm_multi_d_dl_f16_f16_f16_gkm_gnk_gmn_irregular_instance.cpp) + +list(APPEND BATCHED_GEMM_MULTID_INSTANCES device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_instance.cpp) +list(APPEND BATCHED_GEMM_MULTID_INSTANCES device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_instance.cpp) +list(APPEND BATCHED_GEMM_MULTID_INSTANCES device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gkn_gmn_instance.cpp) +list(APPEND BATCHED_GEMM_MULTID_INSTANCES device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gnk_gmn_instance.cpp) +list(APPEND BATCHED_GEMM_MULTID_INSTANCES device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gkn_gmn_irregular_instance.cpp) +list(APPEND BATCHED_GEMM_MULTID_INSTANCES device_batched_gemm_multi_d_dl_i8_i8_i8_gmk_gnk_gmn_irregular_instance.cpp) +list(APPEND BATCHED_GEMM_MULTID_INSTANCES device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gkn_gmn_irregular_instance.cpp) +list(APPEND BATCHED_GEMM_MULTID_INSTANCES 
device_batched_gemm_multi_d_dl_i8_i8_i8_gkm_gnk_gmn_irregular_instance.cpp) + +add_instance_library(device_batched_gemm_multi_d_instance ${BATCHED_GEMM_MULTID_INSTANCES}) diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/CMakeLists.txt index 728e35fc3b..28226fabac 100644 --- a/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/batched_gemm_reduce/CMakeLists.txt @@ -1,8 +1,6 @@ -if(DTYPES MATCHES "fp16" OR DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES) add_instance_library(device_batched_gemm_reduce_instance device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gkn_gmn_instance.cpp device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gnk_gmn_instance.cpp device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gkn_gmn_instance.cpp device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gnk_gmn_instance.cpp ) -endif() diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm/CMakeLists.txt index 5ac55655d4..6244477e16 100644 --- a/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm/CMakeLists.txt @@ -1,5 +1,3 @@ -if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) add_instance_library(device_batched_gemm_softmax_gemm_instance device_batched_gemm_softmax_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp ) -endif() diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/CMakeLists.txt index d5110e4505..3fd4e03449 100644 --- a/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/CMakeLists.txt +++ 
b/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/CMakeLists.txt @@ -1,11 +1,7 @@ set(DEVICE_BATCHED_GEMM_SOFTMAX_GEMM_PERMUTE_INSTANCES) -if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) - list(APPEND DEVICE_BATCHED_GEMM_SOFTMAX_GEMM_PERMUTE_INSTANCES device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp) - list(APPEND DEVICE_BATCHED_GEMM_SOFTMAX_GEMM_PERMUTE_INSTANCES device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp) -endif() -if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES) - list(APPEND DEVICE_BATCHED_GEMM_SOFTMAX_GEMM_PERMUTE_INSTANCES device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp) - list(APPEND DEVICE_BATCHED_GEMM_SOFTMAX_GEMM_PERMUTE_INSTANCES device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp) -endif() +list(APPEND DEVICE_BATCHED_GEMM_SOFTMAX_GEMM_PERMUTE_INSTANCES + device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp + device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp + device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp + device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp) add_instance_library(device_batched_gemm_softmax_gemm_permute_instance ${DEVICE_BATCHED_GEMM_SOFTMAX_GEMM_PERMUTE_INSTANCES}) - diff --git a/library/src/tensor_operation_instance/gpu/contraction_bilinear/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/contraction_bilinear/CMakeLists.txt index 1db6985f61..ee9c419e1a 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_bilinear/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/contraction_bilinear/CMakeLists.txt @@ -1,17 +1,14 @@ 
set(DEVICE_CONTRACTION_BILINEAR_INSTANCES) -if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES) - #float - list(APPEND DEVICE_CONTRACTION_BILINEAR_INSTANCES device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp) -endif() -if(DTYPES MATCHES "fp64" OR NOT DEFINED DTYPES) - #double - list(APPEND DEVICE_CONTRACTION_BILINEAR_INSTANCES device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp - device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp) -endif() -add_instance_library(device_contraction_bilinear_instance ${DEVICE_CONTRACTION_BILINEAR_INSTANCES}) +#float +list(APPEND DEVICE_CONTRACTION_BILINEAR_INSTANCES device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_kknn_instance.cpp + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_knnn_instance.cpp + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mknn_instance.cpp + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_f32_mnnn_instance.cpp) +#double +list(APPEND DEVICE_CONTRACTION_BILINEAR_INSTANCES device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_kknn_instance.cpp + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_knnn_instance.cpp + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mknn_instance.cpp + device_contraction_bilinear_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_f64_mnnn_instance.cpp) + +add_instance_library(device_contraction_bilinear_instance ${DEVICE_CONTRACTION_BILINEAR_INSTANCES}) diff 
--git a/library/src/tensor_operation_instance/gpu/contraction_scale/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/contraction_scale/CMakeLists.txt index aed8bef2a8..673fae3373 100644 --- a/library/src/tensor_operation_instance/gpu/contraction_scale/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/contraction_scale/CMakeLists.txt @@ -1,17 +1,15 @@ set(DEVICE_CONTRACTION_SCALE_INSTANCES) -if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES) - #float - list(APPEND DEVICE_CONTRACTION_SCALE_INSTANCES device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp - device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp - device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp - device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp) -endif() -if(DTYPES MATCHES "fp64" OR NOT DEFINED DTYPES) - #double - list(APPEND DEVICE_CONTRACTION_SCALE_INSTANCES device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp - device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp - device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp - device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp) -endif() +#float +list(APPEND DEVICE_CONTRACTION_SCALE_INSTANCES device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_kkn_instance.cpp + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_knn_instance.cpp + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mkn_instance.cpp + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f32_f32_f32_mnn_instance.cpp) + +#double +list(APPEND DEVICE_CONTRACTION_SCALE_INSTANCES device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_kkn_instance.cpp + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_knn_instance.cpp + device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mkn_instance.cpp + 
device_contraction_scale_m2_n2_k2_xdl_c_shuffle_f64_f64_f64_mnn_instance.cpp) + add_instance_library(device_contraction_scale_instance ${DEVICE_CONTRACTION_SCALE_INSTANCES}) diff --git a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/CMakeLists.txt index 54ef9cc7a6..49dfc01fd9 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/conv2d_bwd_data/CMakeLists.txt @@ -1,23 +1,10 @@ set(CONV2D_BWD_DATA_INSTANCES) -if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES) - list(APPEND CONV2D_BWD_DATA_INSTANCES device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instance.cpp) - if(DL_KERNELS) - list(APPEND CONV2D_BWD_DATA_INSTANCES device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f32_instance.cpp) - endif() -endif() -if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES) - list(APPEND CONV2D_BWD_DATA_INSTANCES device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp) -endif() -if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) - list(APPEND CONV2D_BWD_DATA_INSTANCES device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instance.cpp) - if(DL_KERNELS) - list(APPEND CONV2D_BWD_DATA_INSTANCES device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f16_instance.cpp) - endif() -endif() -if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES) - list(APPEND CONV2D_BWD_DATA_INSTANCES device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instance.cpp) - if(DL_KERNELS) - list(APPEND CONV2D_BWD_DATA_INSTANCES device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_int8_instance.cpp) - endif() -endif() +list(APPEND CONV2D_BWD_DATA_INSTANCES device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instance.cpp + device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f32_instance.cpp + device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp + device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instance.cpp + device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_f16_instance.cpp + device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instance.cpp + 
device_conv2d_bwd_data_dl_nhwc_kyxc_nhwk_int8_instance.cpp) + add_instance_library(device_conv2d_bwd_data_instance ${CONV2D_BWD_DATA_INSTANCES}) diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/conv2d_fwd/CMakeLists.txt index 96ecc95658..ba0ca32517 100644 --- a/library/src/tensor_operation_instance/gpu/conv2d_fwd/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd/CMakeLists.txt @@ -1,16 +1,8 @@ set(DEVICE_CONV2D_FWD_INSTANCES) -if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES) - list(APPEND DEVICE_CONV2D_FWD_INSTANCES device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instance.cpp) -endif() -if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) - list(APPEND DEVICE_CONV2D_FWD_INSTANCES device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instance.cpp) - list(APPEND DEVICE_CONV2D_FWD_INSTANCES device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instance.cpp) -endif() -if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES) - list(APPEND DEVICE_CONV2D_FWD_INSTANCES device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp) -endif() -if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES) - list(APPEND DEVICE_CONV2D_FWD_INSTANCES device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instance.cpp) -endif() - +list(APPEND DEVICE_CONV2D_FWD_INSTANCES device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instance.cpp + device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instance.cpp + device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instance.cpp + device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instance.cpp + device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instance.cpp) + add_instance_library(device_conv2d_fwd_instance ${DEVICE_CONV2D_FWD_INSTANCES}) diff --git a/library/src/tensor_operation_instance/gpu/elementwise_normalization/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/elementwise_normalization/CMakeLists.txt index ea53b82be5..0c7cc2cd31 100644 --- a/library/src/tensor_operation_instance/gpu/elementwise_normalization/CMakeLists.txt +++ 
b/library/src/tensor_operation_instance/gpu/elementwise_normalization/CMakeLists.txt @@ -1,5 +1,3 @@ -if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) add_instance_library(device_elementwise_normalization_instance device_elementwise_normalization_f16_instance.cpp ) -endif() diff --git a/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt index 6c76ee843a..22cf1a462d 100644 --- a/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/gemm/CMakeLists.txt @@ -1,113 +1,99 @@ set(GEMM_INSTANCES) -if(DTYPES MATCHES "fp64" OR NOT DEFINED DTYPES) - list(APPEND GEMM_INSTANCES device_gemm_xdl_f64_f64_f64_mk_kn_mn_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_xdl_f64_f64_f64_mk_nk_mn_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_xdl_f64_f64_f64_km_kn_mn_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_xdl_f64_f64_f64_km_nk_mn_instance.cpp) -endif() -if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES) - list(APPEND GEMM_INSTANCES device_gemm_xdl_f32_f32_f32_mk_kn_mn_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_xdl_f32_f32_f32_mk_nk_mn_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_xdl_f32_f32_f32_km_kn_mn_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_xdl_f32_f32_f32_km_nk_mn_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_xdl_c_shuffle_f32_f32_f32_mk_kn_mn_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_xdl_c_shuffle_f32_f32_f32_km_kn_mn_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_xdl_c_shuffle_f32_f32_f32_km_nk_mn_instance.cpp) - if(DL_KERNELS) - list(APPEND GEMM_INSTANCES device_gemm_dl_f32_f32_f32_mk_kn_mn_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_dl_f32_f32_f32_mk_nk_mn_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_dl_f32_f32_f32_km_kn_mn_instance.cpp) - list(APPEND 
GEMM_INSTANCES device_gemm_dl_f32_f32_f32_km_nk_mn_instance.cpp) - endif() -endif() -if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) - if(DL_KERNELS) - list(APPEND GEMM_INSTANCES device_gemm_dl_f16_f16_f16_mk_kn_mn_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_dl_f16_f16_f16_mk_kn_mn_irregular_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_dl_f16_f16_f16_mk_nk_mn_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_dl_f16_f16_f16_mk_nk_mn_irregular_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_dl_f16_f16_f16_km_kn_mn_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_dl_f16_f16_f16_km_kn_mn_irregular_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_dl_f16_f16_f16_km_nk_mn_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_dl_f16_f16_f16_km_nk_mn_irregular_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_dpp_f16_f16_f16_km_kn_mn_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_dpp_f16_f16_f16_km_kn_mn_irregular_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_dpp_f16_f16_f16_km_nk_mn_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_dpp_f16_f16_f16_km_nk_mn_irregular_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_dpp_f16_f16_f16_mk_kn_mn_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_dpp_f16_f16_f16_mk_kn_mn_irregular_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_dpp_f16_f16_f16_mk_nk_mn_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_dpp_f16_f16_f16_mk_nk_mn_irregular_instance.cpp) - endif() - list(APPEND GEMM_INSTANCES device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instance.cpp) - list(APPEND GEMM_INSTANCES 
device_gemm_xdl_f16_f16_f16/km_kn_mn_add_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_xdl_f16_f16_f16/km_kn_mn_default_pipeline_v1_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_xdl_f16_f16_f16/km_kn_mn_default_pipeline_v2_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_xdl_f16_f16_f16/km_kn_mn_default_pipeline_v2_opt_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_xdl_f16_f16_f16/km_kn_mn_interwave_pipeline_v1_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_xdl_f16_f16_f16/km_kn_mn_irregular_default_pipeline_v1_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_xdl_f16_f16_f16/km_kn_mn_irregular_default_pipeline_v2_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_xdl_f16_f16_f16/km_kn_mn_irregular_interwave_pipeline_v1_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_xdl_f16_f16_f16/km_nk_mn_add_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_xdl_f16_f16_f16/km_nk_mn_default_pipeline_v1_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_xdl_f16_f16_f16/km_nk_mn_default_pipeline_v2_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_xdl_f16_f16_f16/km_nk_mn_default_pipeline_v2_opt_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_xdl_f16_f16_f16/km_nk_mn_interwave_pipeline_v1_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_xdl_f16_f16_f16/km_nk_mn_irregular_default_pipeline_v1_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_xdl_f16_f16_f16/km_nk_mn_irregular_default_pipeline_v2_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_xdl_f16_f16_f16/km_nk_mn_irregular_interwave_pipeline_v1_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_xdl_f16_f16_f16/mk_kn_mn_add_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_xdl_f16_f16_f16/mk_kn_mn_default_pipeline_v1_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_xdl_f16_f16_f16/mk_kn_mn_default_pipeline_v2_instance.cpp) - list(APPEND GEMM_INSTANCES 
device_gemm_xdl_f16_f16_f16/mk_kn_mn_default_pipeline_v2_opt_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_xdl_f16_f16_f16/mk_kn_mn_interwave_pipeline_v1_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_xdl_f16_f16_f16/mk_kn_mn_irregular_default_pipeline_v1_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_xdl_f16_f16_f16/mk_kn_mn_irregular_default_pipeline_v2_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_xdl_f16_f16_f16/mk_kn_mn_irregular_interwave_pipeline_v1_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_xdl_f16_f16_f16/mk_nk_mn_add_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_xdl_f16_f16_f16/mk_nk_mn_default_pipeline_v1_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_xdl_f16_f16_f16/mk_nk_mn_default_pipeline_v2_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_xdl_f16_f16_f16/mk_nk_mn_default_pipeline_v2_opt_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_xdl_f16_f16_f16/mk_nk_mn_interwave_pipeline_v1_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_xdl_f16_f16_f16/mk_nk_mn_irregular_default_pipeline_v1_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_xdl_f16_f16_f16/mk_nk_mn_irregular_default_pipeline_v2_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_xdl_f16_f16_f16/mk_nk_mn_irregular_interwave_pipeline_v1_instance.cpp) -endif() -if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES) - if(DL_KERNELS) - list(APPEND GEMM_INSTANCES device_gemm_dl_i8_i8_i8_mk_kn_mn_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_dl_i8_i8_i8_mk_kn_mn_irregular_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_dl_i8_i8_i8_mk_nk_mn_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_dl_i8_i8_i8_mk_nk_mn_irregular_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_dl_i8_i8_i8_km_kn_mn_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_dl_i8_i8_i8_km_kn_mn_irregular_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_dl_i8_i8_i8_km_nk_mn_instance.cpp) - list(APPEND GEMM_INSTANCES 
device_gemm_dl_i8_i8_i8_km_nk_mn_irregular_instance.cpp) - endif() - list(APPEND GEMM_INSTANCES device_gemm_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp) -endif() -if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES) - list(APPEND GEMM_INSTANCES device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_kn_mn_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_kn_mn_instance.cpp) - list(APPEND GEMM_INSTANCES device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_nk_mn_instance.cpp) -endif() +list(APPEND GEMM_INSTANCES device_gemm_xdl_f64_f64_f64_mk_kn_mn_instance.cpp + device_gemm_xdl_f64_f64_f64_mk_nk_mn_instance.cpp + device_gemm_xdl_f64_f64_f64_km_kn_mn_instance.cpp + device_gemm_xdl_f64_f64_f64_km_nk_mn_instance.cpp) + +list(APPEND GEMM_INSTANCES device_gemm_xdl_f32_f32_f32_mk_kn_mn_instance.cpp + device_gemm_xdl_f32_f32_f32_mk_nk_mn_instance.cpp + device_gemm_xdl_f32_f32_f32_km_kn_mn_instance.cpp + device_gemm_xdl_f32_f32_f32_km_nk_mn_instance.cpp + device_gemm_xdl_c_shuffle_f32_f32_f32_mk_kn_mn_instance.cpp + device_gemm_xdl_c_shuffle_f32_f32_f32_mk_nk_mn_instance.cpp + device_gemm_xdl_c_shuffle_f32_f32_f32_km_kn_mn_instance.cpp + device_gemm_xdl_c_shuffle_f32_f32_f32_km_nk_mn_instance.cpp + device_gemm_dl_f32_f32_f32_mk_kn_mn_instance.cpp + device_gemm_dl_f32_f32_f32_mk_nk_mn_instance.cpp + device_gemm_dl_f32_f32_f32_km_kn_mn_instance.cpp + device_gemm_dl_f32_f32_f32_km_nk_mn_instance.cpp) + +list(APPEND GEMM_INSTANCES device_gemm_dl_f16_f16_f16_mk_kn_mn_instance.cpp + device_gemm_dl_f16_f16_f16_mk_kn_mn_irregular_instance.cpp + device_gemm_dl_f16_f16_f16_mk_nk_mn_instance.cpp + 
device_gemm_dl_f16_f16_f16_mk_nk_mn_irregular_instance.cpp + device_gemm_dl_f16_f16_f16_km_kn_mn_instance.cpp + device_gemm_dl_f16_f16_f16_km_kn_mn_irregular_instance.cpp + device_gemm_dl_f16_f16_f16_km_nk_mn_instance.cpp + device_gemm_dl_f16_f16_f16_km_nk_mn_irregular_instance.cpp + device_gemm_dpp_f16_f16_f16_km_kn_mn_instance.cpp + device_gemm_dpp_f16_f16_f16_km_nk_mn_instance.cpp + device_gemm_dpp_f16_f16_f16_mk_kn_mn_instance.cpp + device_gemm_dpp_f16_f16_f16_mk_nk_mn_instance.cpp + device_gemm_dpp_f16_f16_f16_km_kn_mn_irregular_instance.cpp + device_gemm_dpp_f16_f16_f16_km_nk_mn_irregular_instance.cpp + device_gemm_dpp_f16_f16_f16_mk_kn_mn_irregular_instance.cpp + device_gemm_dpp_f16_f16_f16_mk_nk_mn_irregular_instance.cpp + device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp + device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp + device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp + device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp + device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instance.cpp + device_gemm_xdl_f16_f16_f16/km_kn_mn_add_instance.cpp + device_gemm_xdl_f16_f16_f16/km_kn_mn_default_pipeline_v1_instance.cpp + device_gemm_xdl_f16_f16_f16/km_kn_mn_default_pipeline_v2_instance.cpp + device_gemm_xdl_f16_f16_f16/km_kn_mn_default_pipeline_v2_opt_instance.cpp + device_gemm_xdl_f16_f16_f16/km_kn_mn_interwave_pipeline_v1_instance.cpp + device_gemm_xdl_f16_f16_f16/km_kn_mn_irregular_default_pipeline_v1_instance.cpp + device_gemm_xdl_f16_f16_f16/km_kn_mn_irregular_default_pipeline_v2_instance.cpp + device_gemm_xdl_f16_f16_f16/km_kn_mn_irregular_interwave_pipeline_v1_instance.cpp + device_gemm_xdl_f16_f16_f16/km_nk_mn_add_instance.cpp + device_gemm_xdl_f16_f16_f16/km_nk_mn_default_pipeline_v1_instance.cpp + device_gemm_xdl_f16_f16_f16/km_nk_mn_default_pipeline_v2_instance.cpp + device_gemm_xdl_f16_f16_f16/km_nk_mn_default_pipeline_v2_opt_instance.cpp + 
device_gemm_xdl_f16_f16_f16/km_nk_mn_interwave_pipeline_v1_instance.cpp + device_gemm_xdl_f16_f16_f16/km_nk_mn_irregular_default_pipeline_v1_instance.cpp + device_gemm_xdl_f16_f16_f16/km_nk_mn_irregular_default_pipeline_v2_instance.cpp + device_gemm_xdl_f16_f16_f16/km_nk_mn_irregular_interwave_pipeline_v1_instance.cpp + device_gemm_xdl_f16_f16_f16/mk_kn_mn_add_instance.cpp + device_gemm_xdl_f16_f16_f16/mk_kn_mn_default_pipeline_v1_instance.cpp + device_gemm_xdl_f16_f16_f16/mk_kn_mn_default_pipeline_v2_instance.cpp + device_gemm_xdl_f16_f16_f16/mk_kn_mn_default_pipeline_v2_opt_instance.cpp + device_gemm_xdl_f16_f16_f16/mk_kn_mn_interwave_pipeline_v1_instance.cpp + device_gemm_xdl_f16_f16_f16/mk_kn_mn_irregular_default_pipeline_v1_instance.cpp + device_gemm_xdl_f16_f16_f16/mk_kn_mn_irregular_default_pipeline_v2_instance.cpp + device_gemm_xdl_f16_f16_f16/mk_kn_mn_irregular_interwave_pipeline_v1_instance.cpp + device_gemm_xdl_f16_f16_f16/mk_nk_mn_add_instance.cpp + device_gemm_xdl_f16_f16_f16/mk_nk_mn_default_pipeline_v1_instance.cpp + device_gemm_xdl_f16_f16_f16/mk_nk_mn_default_pipeline_v2_instance.cpp + device_gemm_xdl_f16_f16_f16/mk_nk_mn_default_pipeline_v2_opt_instance.cpp + device_gemm_xdl_f16_f16_f16/mk_nk_mn_interwave_pipeline_v1_instance.cpp + device_gemm_xdl_f16_f16_f16/mk_nk_mn_irregular_default_pipeline_v1_instance.cpp + device_gemm_xdl_f16_f16_f16/mk_nk_mn_irregular_default_pipeline_v2_instance.cpp + device_gemm_xdl_f16_f16_f16/mk_nk_mn_irregular_interwave_pipeline_v1_instance.cpp) + +list(APPEND GEMM_INSTANCES device_gemm_dl_i8_i8_i8_mk_kn_mn_instance.cpp + device_gemm_dl_i8_i8_i8_mk_kn_mn_irregular_instance.cpp + device_gemm_dl_i8_i8_i8_mk_nk_mn_instance.cpp + device_gemm_dl_i8_i8_i8_mk_nk_mn_irregular_instance.cpp + device_gemm_dl_i8_i8_i8_km_kn_mn_instance.cpp + device_gemm_dl_i8_i8_i8_km_kn_mn_irregular_instance.cpp + device_gemm_dl_i8_i8_i8_km_nk_mn_instance.cpp + device_gemm_dl_i8_i8_i8_km_nk_mn_irregular_instance.cpp + 
device_gemm_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp + device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp + device_gemm_xdl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp + device_gemm_xdl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp) + +list(APPEND GEMM_INSTANCES device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_kn_mn_instance.cpp + device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instance.cpp + device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_kn_mn_instance.cpp + device_gemm_xdl_c_shuffle_bf16_bf16_bf16_km_nk_mn_instance.cpp) add_instance_library(device_gemm_instance ${GEMM_INSTANCES}) +set(ENABLE_PIPELINE_V2_OPT OFF) -if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) - set(ENABLE_PIPELINE_V2_OPT OFF) - - if (ENABLE_PIPELINE_V2_OPT) +if (ENABLE_PIPELINE_V2_OPT) set(MAX_ILP_OPTS -mllvm -amdgpu-enable-max-ilp-scheduling-strategy @@ -137,5 +123,5 @@ if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) set_source_files_properties(device_gemm_xdl_f16_f16_f16/mk_nk_mn_default_pipeline_v2_opt_instance.cpp PROPERTIES COMPILE_OPTIONS "${MAX_ILP_OPTS}" COMPILE_DEFINITIONS "${WAVES_PER_EU_DEFS};${IGLP_OPT_DEFS}") - endif(ENABLE_PIPELINE_V2_OPT) -endif(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) +endif(ENABLE_PIPELINE_V2_OPT) + diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/CMakeLists.txt index 9028829fe9..bbf81a5fa2 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/CMakeLists.txt @@ -1,8 +1,6 @@ -if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) add_instance_library(device_gemm_add_add_fastgelu_instance device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instance.cpp device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instance.cpp device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp 
device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp ) -endif() diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/CMakeLists.txt index 1085966807..0beb10e379 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/CMakeLists.txt @@ -1,8 +1,6 @@ -if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) add_instance_library(device_gemm_add_fastgelu_instance device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp ) -endif() diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/CMakeLists.txt index 6079f90130..97693a2566 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/CMakeLists.txt @@ -1,8 +1,6 @@ -if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) add_instance_library(device_gemm_add_relu_add_layernorm_instance device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_km_kn_mn_mn_mn_instance.cpp device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_km_nk_mn_mn_mn_instance.cpp device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_mk_kn_mn_mn_mn_instance.cpp device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_mk_nk_mn_mn_mn_instance.cpp ) -endif() diff --git a/library/src/tensor_operation_instance/gpu/gemm_bilinear/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_bilinear/CMakeLists.txt index aef8fe86dd..426edeed74 100644 --- 
a/library/src/tensor_operation_instance/gpu/gemm_bilinear/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/gemm_bilinear/CMakeLists.txt @@ -1,4 +1,3 @@ -if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) add_instance_library(device_gemm_bilinear_instance device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp device_gemm_bilinear_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp @@ -9,4 +8,3 @@ add_instance_library(device_gemm_bilinear_instance device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_mk_kn_mn_mn_instance.cpp device_gemm_bilinear_wmma_c_shuffle_i8_i8_i8_i8_mk_nk_mn_mn_instance.cpp ) -endif() diff --git a/library/src/tensor_operation_instance/gpu/gemm_fastgelu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_fastgelu/CMakeLists.txt index 772373dcb0..17d27ab150 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_fastgelu/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/gemm_fastgelu/CMakeLists.txt @@ -1,8 +1,6 @@ -if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) add_instance_library(device_gemm_fastgelu_instance device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp ) -endif() diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_add/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_multiply_add/CMakeLists.txt index 36bd6a4aa3..6cbd7528e7 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_multiply_add/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_add/CMakeLists.txt @@ -1,13 +1,6 @@ set(GEMM_MULTIPLY_ADD_INSTANCES) - -if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) - list(APPEND GEMM_MULTIPLY_ADD_INSTANCES device_gemm_multiply_add_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp) - list(APPEND 
GEMM_MULTIPLY_ADD_INSTANCES device_gemm_multiply_add_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp) -endif() - -if((DTYPES MATCHES "fp16" AND DTYPES MATCHES "fp8") OR NOT DEFINED DTYPES) - list(APPEND GEMM_MULTIPLY_ADD_INSTANCES device_gemm_multiply_add_xdl_c_shuffle_f16_f8_f32_f32_f16_mk_kn_mn_mn_mn_instance.cpp) - list(APPEND GEMM_MULTIPLY_ADD_INSTANCES device_gemm_multiply_add_xdl_c_shuffle_f16_f8_f32_f32_f16_mk_nk_mn_mn_mn_instance.cpp) -endif() - +list(APPEND GEMM_MULTIPLY_ADD_INSTANCES device_gemm_multiply_add_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp + device_gemm_multiply_add_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp + device_gemm_multiply_add_xdl_c_shuffle_f16_f8_f32_f32_f16_mk_kn_mn_mn_mn_instance.cpp + device_gemm_multiply_add_xdl_c_shuffle_f16_f8_f32_f32_f16_mk_nk_mn_mn_mn_instance.cpp) add_instance_library(device_gemm_multiply_add_instance ${GEMM_MULTIPLY_ADD_INSTANCES}) diff --git a/library/src/tensor_operation_instance/gpu/gemm_splitk/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_splitk/CMakeLists.txt index 043b28a1be..837e040eac 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_splitk/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/gemm_splitk/CMakeLists.txt @@ -1,28 +1,20 @@ set(GEMM_SPLITK_INSTANCES) -if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES) - list(APPEND GEMM_SPLITK_INSTANCES device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instance.cpp) - list(APPEND GEMM_SPLITK_INSTANCES device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instance.cpp) - list(APPEND GEMM_SPLITK_INSTANCES device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instance.cpp) - list(APPEND GEMM_SPLITK_INSTANCES device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instance.cpp) -endif() - -if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) - list(APPEND GEMM_SPLITK_INSTANCES device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp) - list(APPEND GEMM_SPLITK_INSTANCES 
device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp) - list(APPEND GEMM_SPLITK_INSTANCES device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instance.cpp) - list(APPEND GEMM_SPLITK_INSTANCES device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instance.cpp) -endif() - -if((DTYPES MATCHES "fp16" AND DTYPES MATCHES "fp8") OR NOT DEFINED DTYPES) - list(APPEND GEMM_SPLITK_INSTANCES device_gemm_xdl_splitk_f8_f16_f16_mk_kn_mn_instance.cpp) - list(APPEND GEMM_SPLITK_INSTANCES device_gemm_xdl_splitk_f8_f16_f16_mk_nk_mn_instance.cpp) - list(APPEND GEMM_SPLITK_INSTANCES device_gemm_xdl_splitk_f8_f16_f16_km_kn_mn_instance.cpp) - list(APPEND GEMM_SPLITK_INSTANCES device_gemm_xdl_splitk_f8_f16_f16_km_nk_mn_instance.cpp) - list(APPEND GEMM_SPLITK_INSTANCES device_gemm_xdl_splitk_f16_f8_f16_mk_kn_mn_instance.cpp) - list(APPEND GEMM_SPLITK_INSTANCES device_gemm_xdl_splitk_f16_f8_f16_mk_nk_mn_instance.cpp) - list(APPEND GEMM_SPLITK_INSTANCES device_gemm_xdl_splitk_f16_f8_f16_km_kn_mn_instance.cpp) - list(APPEND GEMM_SPLITK_INSTANCES device_gemm_xdl_splitk_f16_f8_f16_km_nk_mn_instance.cpp) -endif() +list(APPEND GEMM_SPLITK_INSTANCES device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instance.cpp + device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instance.cpp + device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instance.cpp + device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instance.cpp + device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_instance.cpp + device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_instance.cpp + device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instance.cpp + device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instance.cpp + device_gemm_xdl_splitk_f8_f16_f16_mk_kn_mn_instance.cpp + device_gemm_xdl_splitk_f8_f16_f16_mk_nk_mn_instance.cpp + device_gemm_xdl_splitk_f8_f16_f16_km_kn_mn_instance.cpp + device_gemm_xdl_splitk_f8_f16_f16_km_nk_mn_instance.cpp + device_gemm_xdl_splitk_f16_f8_f16_mk_kn_mn_instance.cpp + device_gemm_xdl_splitk_f16_f8_f16_mk_nk_mn_instance.cpp + device_gemm_xdl_splitk_f16_f8_f16_km_kn_mn_instance.cpp + 
device_gemm_xdl_splitk_f16_f8_f16_km_nk_mn_instance.cpp) add_instance_library(device_gemm_splitk_instance ${GEMM_SPLITK_INSTANCES}) diff --git a/library/src/tensor_operation_instance/gpu/gemm_streamk/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_streamk/CMakeLists.txt index d9baf3f062..8dd0112a6b 100644 --- a/library/src/tensor_operation_instance/gpu/gemm_streamk/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/gemm_streamk/CMakeLists.txt @@ -1,4 +1,3 @@ -if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) add_instance_library(device_gemm_streamk_instance # device_gemm_xdl_streamk_f32_f32_f32_mk_kn_mn_instance.cpp # device_gemm_xdl_streamk_f32_f32_f32_mk_nk_mn_instance.cpp @@ -9,4 +8,3 @@ add_instance_library(device_gemm_streamk_instance # device_gemm_xdl_streamk_f16_f16_f16_km_kn_mn_instance.cpp # device_gemm_xdl_streamk_f16_f16_f16_km_nk_mn_instance.cpp ) -endif() diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_gemm/CMakeLists.txt index f1553e3d5b..b973b70aac 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_gemm/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/CMakeLists.txt @@ -1,4 +1,3 @@ -if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) add_instance_library(device_grouped_gemm_instance device_grouped_gemm_xdl_f16_f16_f16_mk_kn_mn_instance.cpp device_grouped_gemm_xdl_f16_f16_f16_mk_nk_mn_instance.cpp @@ -9,4 +8,3 @@ add_instance_library(device_grouped_gemm_instance device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instance.cpp device_grouped_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_irregular_instance.cpp ) -endif() diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm_fastgelu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_gemm_fastgelu/CMakeLists.txt index a45bf36399..648f2146cb 100644 --- 
a/library/src/tensor_operation_instance/gpu/grouped_gemm_fastgelu/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/grouped_gemm_fastgelu/CMakeLists.txt @@ -1,8 +1,6 @@ -if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) add_instance_library(device_grouped_gemm_fastgelu_instance device_grouped_gemm_fastgelu_xdl_f16_f16_f16_mk_kn_mn_instance.cpp device_grouped_gemm_fastgelu_xdl_f16_f16_f16_mk_nk_mn_instance.cpp device_grouped_gemm_fastgelu_xdl_f16_f16_f16_km_kn_mn_instance.cpp device_grouped_gemm_fastgelu_xdl_f16_f16_f16_km_nk_mn_instance.cpp ) -endif() diff --git a/library/src/tensor_operation_instance/gpu/max_pool_bwd/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/max_pool_bwd/CMakeLists.txt index 820a63480e..d12864398e 100644 --- a/library/src/tensor_operation_instance/gpu/max_pool_bwd/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/max_pool_bwd/CMakeLists.txt @@ -1,11 +1,5 @@ set(DEVICE_MAXPOOL_BWD_INSTANCES) -if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) - list(APPEND DEVICE_MAXPOOL_BWD_INSTANCES device_max_pool_bwd_f16_instance.cpp) -endif() -if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES) - list(APPEND DEVICE_MAXPOOL_BWD_INSTANCES device_max_pool_bwd_bf16_instance.cpp) -endif() -if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES) - list(APPEND DEVICE_MAXPOOL_BWD_INSTANCES device_max_pool_bwd_f32_instance.cpp) -endif() +list(APPEND DEVICE_MAXPOOL_BWD_INSTANCES device_max_pool_bwd_f16_instance.cpp + device_max_pool_bwd_bf16_instance.cpp + device_max_pool_bwd_f32_instance.cpp) add_instance_library(device_max_pool_bwd_instance ${DEVICE_MAXPOOL_BWD_INSTANCES}) diff --git a/library/src/tensor_operation_instance/gpu/normalization/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/normalization/CMakeLists.txt index 4892a9c183..19818a523b 100644 --- a/library/src/tensor_operation_instance/gpu/normalization/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/normalization/CMakeLists.txt @@ -1,15 +1,14 @@ 
set(DEVICE_NORMALIZATION_INSTANCES) -if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) - list(APPEND DEVICE_NORMALIZATION_INSTANCES device_layernorm2d_f16_instance.cpp + +list(APPEND DEVICE_NORMALIZATION_INSTANCES + device_layernorm2d_f16_instance.cpp device_layernorm4d_f16_instance.cpp device_groupnorm_f16_instance.cpp device_groupnorm_swish_f16_instance.cpp - device_groupnorm_swish_f16_f32_f32_f16_instance.cpp) -endif() -if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES) - list(APPEND DEVICE_NORMALIZATION_INSTANCES device_layernorm2d_f32_instance.cpp + device_groupnorm_swish_f16_f32_f32_f16_instance.cpp + device_layernorm2d_f32_instance.cpp device_layernorm4d_f32_instance.cpp device_groupnorm_f32_instance.cpp device_groupnorm_swish_f32_instance.cpp) -endif() + add_instance_library(device_normalization_instance ${DEVICE_NORMALIZATION_INSTANCES}) diff --git a/library/src/tensor_operation_instance/gpu/pool3d_fwd/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/pool3d_fwd/CMakeLists.txt index 63bbe75465..533ce89dfd 100644 --- a/library/src/tensor_operation_instance/gpu/pool3d_fwd/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/pool3d_fwd/CMakeLists.txt @@ -1,14 +1,8 @@ set(DEVICE_POOL3D_FWD_INSTANCES) -if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) - list(APPEND DEVICE_POOL3D_FWD_INSTANCES device_avg_pool3d_fwd_ndhwc_f16_instance.cpp - device_max_pool3d_fwd_ndhwc_f16_instance.cpp) -endif() -if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES) - list(APPEND DEVICE_POOL3D_FWD_INSTANCES device_avg_pool3d_fwd_ndhwc_bf16_instance.cpp - device_max_pool3d_fwd_ndhwc_bf16_instance.cpp) -endif() -if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES) - list(APPEND DEVICE_POOL3D_FWD_INSTANCES device_avg_pool3d_fwd_ndhwc_f32_instance.cpp - device_max_pool3d_fwd_ndhwc_f32_instance.cpp) -endif() +list(APPEND DEVICE_POOL3D_FWD_INSTANCES device_avg_pool3d_fwd_ndhwc_f16_instance.cpp + device_max_pool3d_fwd_ndhwc_f16_instance.cpp + device_avg_pool3d_fwd_ndhwc_f32_instance.cpp 
+ device_max_pool3d_fwd_ndhwc_f32_instance.cpp + device_avg_pool3d_fwd_ndhwc_bf16_instance.cpp + device_max_pool3d_fwd_ndhwc_bf16_instance.cpp) add_instance_library(device_pool3d_fwd_instance ${DEVICE_POOL3D_FWD_INSTANCES}) diff --git a/library/src/tensor_operation_instance/gpu/quantization/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/quantization/CMakeLists.txt index 00eb6ff1c1..c22a6e9e96 100644 --- a/library/src/tensor_operation_instance/gpu/quantization/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/quantization/CMakeLists.txt @@ -1,5 +1,3 @@ -if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES) - set(CONV2D_PERLAYER_QUANT_SRC conv2d_fwd/device_conv2d_xdl_perlayer_quantization_int8_instance.cpp) set(CONV2D_PERCHANNEL_QUANT_SRC conv2d_fwd/device_conv2d_xdl_perchannel_quantization_int8_instance.cpp) set(CONV2D_BIAS_PERLAYER_QUANT_SRC conv2d_fwd/device_conv2d_xdl_bias_perlayer_quantization_int8_instance.cpp) @@ -10,17 +8,16 @@ set(GEMM_QUANT_SRC gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp gemm/device_gemm_quantization_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp ) -if(DL_KERNELS) - list(APPEND CONV2D_PERLAYER_QUANT_SRC conv2d_fwd/device_conv2d_dl_perlayer_quantization_int8_instance.cpp) - list(APPEND CONV2D_PERCHANNEL_QUANT_SRC conv2d_fwd/device_conv2d_dl_perchannel_quantization_int8_instance.cpp) - list(APPEND CONV2D_BIAS_PERLAYER_QUANT_SRC conv2d_fwd/device_conv2d_dl_bias_perlayer_quantization_int8_instance.cpp) - list(APPEND CONV2D_BIAS_PERCHANNEL_QUANT_SRC conv2d_fwd/device_conv2d_dl_bias_perchannel_quantization_int8_instance.cpp) - list(APPEND GEMM_QUANT_SRC - gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp - gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp - gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp - gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp) -endif() + +list(APPEND 
CONV2D_PERLAYER_QUANT_SRC conv2d_fwd/device_conv2d_dl_perlayer_quantization_int8_instance.cpp) +list(APPEND CONV2D_PERCHANNEL_QUANT_SRC conv2d_fwd/device_conv2d_dl_perchannel_quantization_int8_instance.cpp) +list(APPEND CONV2D_BIAS_PERLAYER_QUANT_SRC conv2d_fwd/device_conv2d_dl_bias_perlayer_quantization_int8_instance.cpp) +list(APPEND CONV2D_BIAS_PERCHANNEL_QUANT_SRC conv2d_fwd/device_conv2d_dl_bias_perchannel_quantization_int8_instance.cpp) +list(APPEND GEMM_QUANT_SRC + gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_kn_mn_instance.cpp + gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_km_nk_mn_instance.cpp + gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_kn_mn_instance.cpp + gemm/device_gemm_quantization_dl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp) add_instance_library(device_quantization_instance ${CONV2D_PERLAYER_QUANT_SRC} @@ -29,4 +26,3 @@ add_instance_library(device_quantization_instance ${CONV2D_BIAS_PERCHANNEL_QUANT_SRC} ${GEMM_QUANT_SRC} ) -endif() \ No newline at end of file diff --git a/library/src/tensor_operation_instance/gpu/softmax/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/softmax/CMakeLists.txt index ba0197477f..dbe3764115 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/softmax/CMakeLists.txt @@ -1,20 +1,17 @@ set(DEVICE_SOFTMAX_INSTANCES) -if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) - list(APPEND DEVICE_SOFTMAX_INSTANCES device_softmax_f16_f16_instance_rank3_reduce1.cpp +list(APPEND DEVICE_SOFTMAX_INSTANCES + device_softmax_f16_f16_instance_rank3_reduce1.cpp device_softmax_f16_f16_instance_rank3_reduce2.cpp device_softmax_f16_f16_instance_rank3_reduce3.cpp device_softmax_f16_f16_instance_rank4_reduce1.cpp device_softmax_f16_f16_instance_rank4_reduce2.cpp device_softmax_f16_f16_instance_rank4_reduce3.cpp - device_softmax_f16_f16_instance_rank4_reduce4.cpp) -endif() -if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES) - list(APPEND 
DEVICE_SOFTMAX_INSTANCES device_softmax_f32_f32_instance_rank3_reduce1.cpp + device_softmax_f16_f16_instance_rank4_reduce4.cpp + device_softmax_f32_f32_instance_rank3_reduce1.cpp device_softmax_f32_f32_instance_rank3_reduce2.cpp device_softmax_f32_f32_instance_rank3_reduce3.cpp device_softmax_f32_f32_instance_rank4_reduce1.cpp device_softmax_f32_f32_instance_rank4_reduce2.cpp device_softmax_f32_f32_instance_rank4_reduce3.cpp device_softmax_f32_f32_instance_rank4_reduce4.cpp) -endif() add_instance_library(device_softmax_instance ${DEVICE_SOFTMAX_INSTANCES}) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 8fddd60858..368965aa8a 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -9,26 +9,121 @@ add_custom_target(tests) function(add_test_executable TEST_NAME) message("adding test ${TEST_NAME}") - add_executable(${TEST_NAME} ${ARGN}) - add_test(NAME ${TEST_NAME} COMMAND $<TARGET_FILE:${TEST_NAME}>) - add_dependencies(tests ${TEST_NAME}) - add_dependencies(check ${TEST_NAME}) - rocm_install(TARGETS ${TEST_NAME} COMPONENT tests) + set(result 1) + if(DEFINED DTYPES) + foreach(source IN LISTS ARGN) + set(test 0) + foreach(type IN LISTS DTYPES) + if(type MATCHES "fp16") + set(type1 "_f16") + elseif(type MATCHES "fp32") + set(type1 "_f32") + elseif(type MATCHES "fp8") + set(type1 "_f8") + elseif(type MATCHES "bf16") + set(type1 "_b16") + elseif(type MATCHES "fp64") + set(type1 "_f64") + elseif(type MATCHES "int8") + set(type1 "_i8") + endif() + if("${source}" MATCHES "${type}" OR "${source}" MATCHES "${type1}") + #if filename matches any selected type, exit type loop and do not exclude the file from the list + set(test 0) + break() + elseif((source MATCHES "fp8" OR source MATCHES "fp32" OR source MATCHES "fp64" OR source MATCHES "bf16" OR source MATCHES "int8" OR source MATCHES "fp16" OR + source MATCHES "_f8" OR source MATCHES "_f32" OR source MATCHES "_f64" OR source MATCHES "_i8" OR source MATCHES "_f16" OR source MATCHES "_b16") AND + NOT(source MATCHES type OR source 
MATCHES type1)) + #if filename contains a type which doesn't match any selected type, mark it for removal + set(test 1) + endif() + endforeach() + if(test EQUAL 1) + message("removing test ${source} ") + list(REMOVE_ITEM ARGN "${source}") + endif() + endforeach() + endif() + foreach(source IN LISTS ARGN) + if(NOT DEFINED DL_KERNELS AND source MATCHES "_dl") + message("removing dl test ${source} ") + list(REMOVE_ITEM ARGN "${source}") + endif() + endforeach() + + #only continue if there are some source files left on the list + if(ARGN) + add_executable(${TEST_NAME} ${ARGN}) + add_test(NAME ${TEST_NAME} COMMAND $<TARGET_FILE:${TEST_NAME}>) + add_dependencies(tests ${TEST_NAME}) + add_dependencies(check ${TEST_NAME}) + rocm_install(TARGETS ${TEST_NAME} COMPONENT tests) + set(result 0) + endif() + #message("add_test returns ${result}") + return(PROPAGATE result) endfunction(add_test_executable TEST_NAME) include(GoogleTest) function(add_gtest_executable TEST_NAME) message("adding gtest ${TEST_NAME}") - add_executable(${TEST_NAME} ${ARGN}) - add_dependencies(tests ${TEST_NAME}) - add_dependencies(check ${TEST_NAME}) + set(result 1) + if(DEFINED DTYPES) + foreach(source IN LISTS ARGN) + set(test 0) + foreach(type IN LISTS DTYPES) + if(type MATCHES "fp16") + set(type1 "_f16") + elseif(type MATCHES "fp32") + set(type1 "_f32") + elseif(type MATCHES "fp8") + set(type1 "_f8") + elseif(type MATCHES "bf16") + set(type1 "_b16") + elseif(type MATCHES "fp64") + set(type1 "_f64") + elseif(type MATCHES "int8") + set(type1 "_i8") + endif() + if("${source}" MATCHES "${type}" OR "${source}" MATCHES "${type1}") + #if filename matches any selected type, exit type loop and do not exclude the file from the list + set(test 0) + break() + elseif((source MATCHES "fp8" OR source MATCHES "fp32" OR source MATCHES "fp64" OR source MATCHES "bf16" OR source MATCHES "int8" OR source MATCHES "fp16" OR + source MATCHES "_f8" OR source MATCHES "_f32" OR source MATCHES "_f64" OR source MATCHES "_i8" OR source MATCHES "_f16" OR 
source MATCHES "_b16") AND + NOT(source MATCHES type OR source MATCHES type1)) + #if filename contains a type which doesn't match any selected type, mark it for removal + set(test 1) + endif() + endforeach() + if(test EQUAL 1) + message("removing gtest ${source} ") + list(REMOVE_ITEM ARGN "${source}") + endif() + endforeach() + endif() + foreach(source IN LISTS ARGN) + if(NOT DEFINED DL_KERNELS AND source MATCHES "_dl") + message("removing dl test ${source} ") + list(REMOVE_ITEM ARGN "${source}") + endif() + endforeach() + #only continue if there are some source files left on the list + if(ARGN) + add_executable(${TEST_NAME} ${ARGN}) + add_dependencies(tests ${TEST_NAME}) + add_dependencies(check ${TEST_NAME}) - # suppress gtest warnings - target_compile_options(${TEST_NAME} PRIVATE -Wno-global-constructors -Wno-undef) - target_link_libraries(${TEST_NAME} PRIVATE gtest_main) - add_test(NAME ${TEST_NAME} COMMAND $<TARGET_FILE:${TEST_NAME}>) - rocm_install(TARGETS ${TEST_NAME} COMPONENT tests) + # suppress gtest warnings + target_compile_options(${TEST_NAME} PRIVATE -Wno-global-constructors -Wno-undef) + target_link_libraries(${TEST_NAME} PRIVATE gtest_main) + add_test(NAME ${TEST_NAME} COMMAND $<TARGET_FILE:${TEST_NAME}>) + rocm_install(TARGETS ${TEST_NAME} COMPONENT tests) + set(result 0) + endif() + #message("add_gtest returns ${result}") + return(PROPAGATE result) endfunction(add_gtest_executable TEST_NAME) add_subdirectory(magic_number_division) diff --git a/test/batched_gemm/CMakeLists.txt b/test/batched_gemm/CMakeLists.txt index 8a3269e90f..d169c135ca 100644 --- a/test/batched_gemm/CMakeLists.txt +++ b/test/batched_gemm/CMakeLists.txt @@ -2,25 +2,21 @@ list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942) set(target 0) foreach(gpu IN LISTS GPU_TARGETS) if(gpu IN_LIST gpu_list AND target EQUAL 0) - if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) - add_test_executable(test_batched_gemm_fp16 batched_gemm_fp16.cpp) - target_link_libraries(test_batched_gemm_fp16 PRIVATE utility) - 
target_link_libraries(test_batched_gemm_fp16 PRIVATE device_batched_gemm_instance) + add_test_executable(test_batched_gemm_fp16 batched_gemm_fp16.cpp) + if(result EQUAL 0) + target_link_libraries(test_batched_gemm_fp16 PRIVATE utility device_batched_gemm_instance) endif() - if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES) - add_test_executable(test_batched_gemm_fp32 batched_gemm_fp32.cpp) - target_link_libraries(test_batched_gemm_fp32 PRIVATE utility) - target_link_libraries(test_batched_gemm_fp32 PRIVATE device_batched_gemm_instance) + add_test_executable(test_batched_gemm_fp32 batched_gemm_fp32.cpp) + if(result EQUAL 0) + target_link_libraries(test_batched_gemm_fp32 PRIVATE utility device_batched_gemm_instance) endif() - if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES) - add_test_executable(test_batched_gemm_bf16 batched_gemm_bf16.cpp) - target_link_libraries(test_batched_gemm_bf16 PRIVATE utility) - target_link_libraries(test_batched_gemm_bf16 PRIVATE device_batched_gemm_instance) + add_test_executable(test_batched_gemm_bf16 batched_gemm_bf16.cpp) + if(result EQUAL 0) + target_link_libraries(test_batched_gemm_bf16 PRIVATE utility device_batched_gemm_instance) endif() - if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES) - add_test_executable(test_batched_gemm_int8 batched_gemm_int8.cpp) - target_link_libraries(test_batched_gemm_int8 PRIVATE utility) - target_link_libraries(test_batched_gemm_int8 PRIVATE device_batched_gemm_instance) + add_test_executable(test_batched_gemm_int8 batched_gemm_int8.cpp) + if(result EQUAL 0) + target_link_libraries(test_batched_gemm_int8 PRIVATE utility device_batched_gemm_instance) endif() set(target 1) endif() diff --git a/test/batched_gemm_gemm/CMakeLists.txt b/test/batched_gemm_gemm/CMakeLists.txt index 404e74f3bc..03f1d3a4eb 100644 --- a/test/batched_gemm_gemm/CMakeLists.txt +++ b/test/batched_gemm_gemm/CMakeLists.txt @@ -2,12 +2,12 @@ list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942) set(target 0) foreach(gpu IN LISTS 
GPU_TARGETS) if(gpu IN_LIST gpu_list AND target EQUAL 0) - if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) - add_custom_target(test_batched_gemm_gemm) - add_gtest_executable(test_batched_gemm_gemm_fp16 test_batched_gemm_gemm_fp16.cpp) + add_custom_target(test_batched_gemm_gemm) + add_gtest_executable(test_batched_gemm_gemm_fp16 test_batched_gemm_gemm_fp16.cpp) + if(result EQUAL 0) target_link_libraries(test_batched_gemm_gemm_fp16 PRIVATE utility device_batched_gemm_gemm_instance) add_dependencies(test_batched_gemm_gemm test_batched_gemm_gemm_fp16) set(target 1) - endif() + endif() endif() endforeach() \ No newline at end of file diff --git a/test/batched_gemm_multi_d/CMakeLists.txt b/test/batched_gemm_multi_d/CMakeLists.txt index 825f0dd235..d5e4c4fbe8 100644 --- a/test/batched_gemm_multi_d/CMakeLists.txt +++ b/test/batched_gemm_multi_d/CMakeLists.txt @@ -1,4 +1,4 @@ -if(DL_KERNELS) - add_gtest_executable(test_batched_gemm_multi_d test_batched_gemm_multi_d.cpp) +add_gtest_executable(test_batched_gemm_multi_d test_batched_gemm_multi_d_dl.cpp) +if(result EQUAL 0) target_link_libraries(test_batched_gemm_multi_d PRIVATE utility device_batched_gemm_multi_d_instance) endif() diff --git a/test/batched_gemm_multi_d/test_batched_gemm_multi_d.cpp b/test/batched_gemm_multi_d/test_batched_gemm_multi_d_dl.cpp similarity index 100% rename from test/batched_gemm_multi_d/test_batched_gemm_multi_d.cpp rename to test/batched_gemm_multi_d/test_batched_gemm_multi_d_dl.cpp diff --git a/test/batched_gemm_reduce/CMakeLists.txt b/test/batched_gemm_reduce/CMakeLists.txt index af95a50eab..32c6ee85d1 100644 --- a/test/batched_gemm_reduce/CMakeLists.txt +++ b/test/batched_gemm_reduce/CMakeLists.txt @@ -2,10 +2,9 @@ list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942) set(target 0) foreach(gpu IN LISTS GPU_TARGETS) if(gpu IN_LIST gpu_list AND target EQUAL 0) - if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) - add_test_executable(test_batched_gemm_reduce_fp16 batched_gemm_reduce_fp16.cpp) - 
target_link_libraries(test_batched_gemm_reduce_fp16 PRIVATE utility) - target_link_libraries(test_batched_gemm_reduce_fp16 PRIVATE device_batched_gemm_reduce_instance) + add_test_executable(test_batched_gemm_reduce_fp16 batched_gemm_reduce_fp16.cpp) + if(result EQUAL 0) + target_link_libraries(test_batched_gemm_reduce_fp16 PRIVATE utility device_batched_gemm_reduce_instance) set(target 1) endif() endif() diff --git a/test/batched_gemm_softmax_gemm/CMakeLists.txt b/test/batched_gemm_softmax_gemm/CMakeLists.txt index c49175a2eb..c011a6a3c5 100644 --- a/test/batched_gemm_softmax_gemm/CMakeLists.txt +++ b/test/batched_gemm_softmax_gemm/CMakeLists.txt @@ -2,12 +2,12 @@ list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942) set(target 0) foreach(gpu IN LISTS GPU_TARGETS) if(gpu IN_LIST gpu_list AND target EQUAL 0) - if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) - add_custom_target(test_batched_gemm_softmax_gemm) - add_gtest_executable(test_batched_gemm_softmax_gemm_fp16 test_batched_gemm_softmax_gemm_fp16.cpp) - target_link_libraries(test_batched_gemm_softmax_gemm_fp16 PRIVATE utility device_batched_gemm_softmax_gemm_instance) - add_dependencies(test_batched_gemm_softmax_gemm test_batched_gemm_softmax_gemm_fp16) - set(target 1) - endif() + add_custom_target(test_batched_gemm_softmax_gemm) + add_gtest_executable(test_batched_gemm_softmax_gemm_fp16 test_batched_gemm_softmax_gemm_fp16.cpp) + if(result EQUAL 0) + target_link_libraries(test_batched_gemm_softmax_gemm_fp16 PRIVATE utility device_batched_gemm_softmax_gemm_instance) + add_dependencies(test_batched_gemm_softmax_gemm test_batched_gemm_softmax_gemm_fp16) + set(target 1) + endif() endif() endforeach() \ No newline at end of file diff --git a/test/batched_gemm_softmax_gemm_permute/CMakeLists.txt b/test/batched_gemm_softmax_gemm_permute/CMakeLists.txt index ae28daa80e..3164863eef 100644 --- a/test/batched_gemm_softmax_gemm_permute/CMakeLists.txt +++ b/test/batched_gemm_softmax_gemm_permute/CMakeLists.txt @@ -2,25 
+2,28 @@ list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942) set(target 0) foreach(gpu IN LISTS GPU_TARGETS) if(gpu IN_LIST gpu_list AND target EQUAL 0) - if(DTYPES MATCHES "fp16" OR DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES) - add_custom_target(test_batched_gemm_softmax_gemm_permute) - endif() - if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) - add_gtest_executable(test_batched_gemm_softmax_gemm_permute_fp16 test_batched_gemm_softmax_gemm_permute_fp16.cpp) - add_gtest_executable(test_batched_gemm_bias_softmax_gemm_permute_fp16 test_batched_gemm_bias_softmax_gemm_permute_fp16.cpp) - target_link_libraries(test_batched_gemm_softmax_gemm_permute_fp16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance) - target_link_libraries(test_batched_gemm_bias_softmax_gemm_permute_fp16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance) - add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_softmax_gemm_permute_fp16) - add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_bias_softmax_gemm_permute_fp16) - endif() - if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES) - add_gtest_executable(test_batched_gemm_softmax_gemm_permute_bf16 test_batched_gemm_softmax_gemm_permute_bf16.cpp) - add_gtest_executable(test_batched_gemm_bias_softmax_gemm_permute_bf16 test_batched_gemm_bias_softmax_gemm_permute_bf16.cpp) - target_link_libraries(test_batched_gemm_softmax_gemm_permute_bf16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance) - target_link_libraries(test_batched_gemm_bias_softmax_gemm_permute_bf16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance) - add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_softmax_gemm_permute_bf16) - add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_bias_softmax_gemm_permute_bf16) - endif() + add_custom_target(test_batched_gemm_softmax_gemm_permute) + 
add_gtest_executable(test_batched_gemm_softmax_gemm_permute_fp16 test_batched_gemm_softmax_gemm_permute_fp16.cpp) + if(result EQUAL 0) + target_link_libraries(test_batched_gemm_softmax_gemm_permute_fp16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance) + add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_softmax_gemm_permute_fp16) + endif() + add_gtest_executable(test_batched_gemm_bias_softmax_gemm_permute_fp16 test_batched_gemm_bias_softmax_gemm_permute_fp16.cpp) + if(result EQUAL 0) + target_link_libraries(test_batched_gemm_bias_softmax_gemm_permute_fp16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance) + add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_bias_softmax_gemm_permute_fp16) + endif() + + add_gtest_executable(test_batched_gemm_softmax_gemm_permute_bf16 test_batched_gemm_softmax_gemm_permute_bf16.cpp) + if(result EQUAL 0) + target_link_libraries(test_batched_gemm_softmax_gemm_permute_bf16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance) + add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_softmax_gemm_permute_bf16) + endif() + add_gtest_executable(test_batched_gemm_bias_softmax_gemm_permute_bf16 test_batched_gemm_bias_softmax_gemm_permute_bf16.cpp) + if(result EQUAL 0) + target_link_libraries(test_batched_gemm_bias_softmax_gemm_permute_bf16 PRIVATE utility device_batched_gemm_softmax_gemm_permute_instance) + add_dependencies(test_batched_gemm_softmax_gemm_permute test_batched_gemm_bias_softmax_gemm_permute_bf16) + endif() set(target 1) endif() endforeach() \ No newline at end of file diff --git a/test/data_type/CMakeLists.txt b/test/data_type/CMakeLists.txt index baf0174556..b1606d2a75 100644 --- a/test/data_type/CMakeLists.txt +++ b/test/data_type/CMakeLists.txt @@ -1,14 +1,15 @@ if (USE_BITINT_EXTENSION_INT4) add_gtest_executable(test_int4 int4.cpp) - target_link_libraries(test_int4 PRIVATE utility) + if(result EQUAL 0) + 
target_link_libraries(test_int4 PRIVATE utility) + endif() endif() -if(DTYPES MATCHES "fp8" OR NOT DEFINED DTYPES) - add_gtest_executable(test_f8 f8.cpp) - target_link_libraries(test_f8 PRIVATE utility) +add_gtest_executable(test_fp8 fp8.cpp) +if(result EQUAL 0) + target_link_libraries(test_fp8 PRIVATE utility) endif() - -if(DTYPES MATCHES "bf8" OR NOT DEFINED DTYPES) - add_gtest_executable(test_bf8 bf8.cpp) +add_gtest_executable(test_bf8 bf8.cpp) +if(result EQUAL 0) target_link_libraries(test_bf8 PRIVATE utility) endif() diff --git a/test/data_type/f8.cpp b/test/data_type/fp8.cpp similarity index 100% rename from test/data_type/f8.cpp rename to test/data_type/fp8.cpp diff --git a/test/elementwise_normalization/CMakeLists.txt b/test/elementwise_normalization/CMakeLists.txt index 74a3e4999e..aed67901b5 100644 --- a/test/elementwise_normalization/CMakeLists.txt +++ b/test/elementwise_normalization/CMakeLists.txt @@ -1,6 +1,6 @@ -if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) - add_custom_target(test_elementwise_normalization) - add_gtest_executable(test_elementwise_layernorm_fp16 test_elementwise_layernorm_fp16.cpp) +add_custom_target(test_elementwise_normalization) +add_gtest_executable(test_elementwise_layernorm_fp16 test_elementwise_layernorm_fp16.cpp) +if(result EQUAL 0) target_link_libraries(test_elementwise_layernorm_fp16 PRIVATE utility device_elementwise_normalization_instance) add_dependencies(test_elementwise_normalization test_elementwise_layernorm_fp16) endif() \ No newline at end of file diff --git a/test/gemm/CMakeLists.txt b/test/gemm/CMakeLists.txt index 7a8836bfef..f88a134041 100644 --- a/test/gemm/CMakeLists.txt +++ b/test/gemm/CMakeLists.txt @@ -1,30 +1,28 @@ -if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES) add_test_executable(test_gemm_fp32 gemm_fp32.cpp) -target_link_libraries(test_gemm_fp32 PRIVATE utility) -target_link_libraries(test_gemm_fp32 PRIVATE device_gemm_instance) +if(result EQUAL 0) + target_link_libraries(test_gemm_fp32 PRIVATE 
utility device_gemm_instance) endif() -if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) add_test_executable(test_gemm_fp16 gemm_fp16.cpp) -target_link_libraries(test_gemm_fp16 PRIVATE utility) -target_link_libraries(test_gemm_fp16 PRIVATE device_gemm_instance) -add_library(gemm_standalone_xdl_fp16_instances STATIC +if(result EQUAL 0) + target_link_libraries(test_gemm_fp16 PRIVATE utility device_gemm_instance) + add_library(gemm_standalone_xdl_fp16_instances STATIC instance/gemm_f16_nn_instance.cpp instance/gemm_f16_nt_instance.cpp instance/gemm_f16_tn_instance.cpp instance/gemm_wavelet_f16_tn_instance.cpp instance/gemm_f16_tt_instance.cpp -) + ) +endif() add_test_executable(test_gemm_standalone_xdl_fp16 gemm_standalone_xdl_fp16.cpp) -target_link_libraries(test_gemm_standalone_xdl_fp16 PRIVATE gemm_standalone_xdl_fp16_instances utility) -target_include_directories(test_gemm_standalone_xdl_fp16 PRIVATE instance/) +if(result EQUAL 0) + target_link_libraries(test_gemm_standalone_xdl_fp16 PRIVATE gemm_standalone_xdl_fp16_instances utility) + target_include_directories(test_gemm_standalone_xdl_fp16 PRIVATE instance/) endif() -if(DTYPES MATCHES "bf16" OR NOT DEFINED DTYPES) add_test_executable(test_gemm_bf16 gemm_bf16.cpp) -target_link_libraries(test_gemm_bf16 PRIVATE utility) -target_link_libraries(test_gemm_bf16 PRIVATE device_gemm_instance) +if(result EQUAL 0) + target_link_libraries(test_gemm_bf16 PRIVATE utility device_gemm_instance) endif() -if(DTYPES MATCHES "int8" OR NOT DEFINED DTYPES) add_test_executable(test_gemm_int8 gemm_int8.cpp) -target_link_libraries(test_gemm_int8 PRIVATE utility) -target_link_libraries(test_gemm_int8 PRIVATE device_gemm_instance) +if(result EQUAL 0) + target_link_libraries(test_gemm_int8 PRIVATE utility device_gemm_instance) endif() \ No newline at end of file diff --git a/test/gemm_layernorm/CMakeLists.txt b/test/gemm_layernorm/CMakeLists.txt index ba0a99b67f..bfc4404bd8 100644 --- a/test/gemm_layernorm/CMakeLists.txt +++ 
b/test/gemm_layernorm/CMakeLists.txt @@ -2,12 +2,12 @@ list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942) set(target 0) foreach(gpu IN LISTS GPU_TARGETS) if(gpu IN_LIST gpu_list AND target EQUAL 0) - if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) add_custom_target(test_gemm_layernorm) add_gtest_executable(test_gemm_add_relu_add_layernorm_fp16 test_gemm_add_relu_add_layernorm_fp16.cpp) - target_link_libraries(test_gemm_add_relu_add_layernorm_fp16 PRIVATE utility device_gemm_add_relu_add_layernorm_instance) - add_dependencies(test_gemm_layernorm test_gemm_add_relu_add_layernorm_fp16) - set(target 1) - endif() + if(result EQUAL 0) + target_link_libraries(test_gemm_add_relu_add_layernorm_fp16 PRIVATE utility device_gemm_add_relu_add_layernorm_instance) + add_dependencies(test_gemm_layernorm test_gemm_add_relu_add_layernorm_fp16) + set(target 1) + endif() endif() endforeach() diff --git a/test/gemm_reduce/CMakeLists.txt b/test/gemm_reduce/CMakeLists.txt index 43c8d60745..42a53c3048 100644 --- a/test/gemm_reduce/CMakeLists.txt +++ b/test/gemm_reduce/CMakeLists.txt @@ -1,5 +1,4 @@ -if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) - add_test_executable(test_gemm_reduce_fp16 gemm_reduce_fp16.cpp) - target_link_libraries(test_gemm_reduce_fp16 PRIVATE utility) - target_link_libraries(test_gemm_reduce_fp16 PRIVATE device_gemm_reduce_instance) +add_test_executable(test_gemm_reduce_fp16 gemm_reduce_fp16.cpp) +if(result EQUAL 0) + target_link_libraries(test_gemm_reduce_fp16 PRIVATE utility device_gemm_reduce_instance) endif() \ No newline at end of file diff --git a/test/grouped_gemm/CMakeLists.txt b/test/grouped_gemm/CMakeLists.txt index 476d953ed8..8c57b667e2 100644 --- a/test/grouped_gemm/CMakeLists.txt +++ b/test/grouped_gemm/CMakeLists.txt @@ -1,4 +1,3 @@ -if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942) set(target 0) foreach(gpu IN LISTS GPU_TARGETS) @@ -13,4 +12,3 @@ foreach(gpu IN LISTS GPU_TARGETS) set(target 
1) endif() endforeach() -endif() diff --git a/test/normalization/CMakeLists.txt b/test/normalization/CMakeLists.txt index 2beda4dd74..fe4b1ad82e 100644 --- a/test/normalization/CMakeLists.txt +++ b/test/normalization/CMakeLists.txt @@ -1,19 +1,21 @@ -if(DTYPES MATCHES "fp16" OR DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES) - add_custom_target(test_normalization) -endif() -if(DTYPES MATCHES "fp32" OR NOT DEFINED DTYPES) - add_gtest_executable(test_layernorm2d_fp32 test_layernorm2d_fp32.cpp) - add_gtest_executable(test_groupnorm_fp32 test_groupnorm_fp32.cpp) +add_custom_target(test_normalization) +add_gtest_executable(test_layernorm2d_fp32 test_layernorm2d_fp32.cpp) +if(result EQUAL 0) target_link_libraries(test_layernorm2d_fp32 PRIVATE utility device_normalization_instance) - target_link_libraries(test_groupnorm_fp32 PRIVATE utility device_normalization_instance) add_dependencies(test_normalization test_layernorm2d_fp32) +endif() +add_gtest_executable(test_groupnorm_fp32 test_groupnorm_fp32.cpp) +if(result EQUAL 0) + target_link_libraries(test_groupnorm_fp32 PRIVATE utility device_normalization_instance) add_dependencies(test_normalization test_groupnorm_fp32) endif() -if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) - add_gtest_executable(test_layernorm2d_fp16 test_layernorm2d_fp16.cpp) - add_gtest_executable(test_groupnorm_fp16 test_groupnorm_fp16.cpp) +add_gtest_executable(test_layernorm2d_fp16 test_layernorm2d_fp16.cpp) +if(result EQUAL 0) target_link_libraries(test_layernorm2d_fp16 PRIVATE utility device_normalization_instance) - target_link_libraries(test_groupnorm_fp16 PRIVATE utility device_normalization_instance) add_dependencies(test_normalization test_layernorm2d_fp16) +endif() +add_gtest_executable(test_groupnorm_fp16 test_groupnorm_fp16.cpp) +if(result EQUAL 0) + target_link_libraries(test_groupnorm_fp16 PRIVATE utility device_normalization_instance) add_dependencies(test_normalization test_groupnorm_fp16) endif() diff --git a/test/reduce/CMakeLists.txt 
b/test/reduce/CMakeLists.txt index fb436165ea..fabaf7cc54 100644 --- a/test/reduce/CMakeLists.txt +++ b/test/reduce/CMakeLists.txt @@ -1,7 +1,5 @@ add_test_executable(test_reduce_no_index reduce_no_index.cpp) add_test_executable(test_reduce_with_index reduce_with_index.cpp) -target_link_libraries(test_reduce_no_index PRIVATE utility) -target_link_libraries(test_reduce_no_index PRIVATE device_reduce_instance) -target_link_libraries(test_reduce_with_index PRIVATE utility) -target_link_libraries(test_reduce_with_index PRIVATE device_reduce_instance) +target_link_libraries(test_reduce_no_index PRIVATE utility device_reduce_instance) +target_link_libraries(test_reduce_with_index PRIVATE utility device_reduce_instance)