From ee93500dad3eeb55e3f730d88ea5d16b9b3b3a94 Mon Sep 17 00:00:00 2001 From: Illia Silin <98187287+illsilin@users.noreply.github.com> Date: Mon, 7 Oct 2024 08:18:23 -0700 Subject: [PATCH] Fix build logic using GPU_ARCHS. (#1536) * update build logic with GPU_ARCHS * fix the GPU_ARCHS build for codegen * unset GPU_TARGETS when GPU_ARCHS are set [ROCm/composable_kernel commit: 7d8ea5f08bfea303b978c3fcb4f5b7069985b0ff] --- CMakeLists.txt | 101 +++++++----------- Jenkinsfile | 4 +- README.md | 11 +- codegen/test/CMakeLists.txt | 3 +- example/CMakeLists.txt | 13 +-- include/ck/config.h.in | 7 -- .../gpu/CMakeLists.txt | 19 +--- profiler/src/CMakeLists.txt | 12 +-- test/CMakeLists.txt | 16 +-- 9 files changed, 64 insertions(+), 122 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index dc73b5f4d4..989995d0f5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -98,11 +98,6 @@ if(DL_KERNELS) set(CK_ENABLE_DL_KERNELS "ON") endif() -if(INSTANCES_ONLY) - add_definitions(-DINSTANCES_ONLY) - set(CK_ENABLE_INSTANCES_ONLY "ON") -endif() - include(getopt) # CK version file to record release version as well as git commit hash @@ -127,6 +122,12 @@ rocm_setup_version(VERSION ${version}) list(APPEND CMAKE_PREFIX_PATH ${CMAKE_INSTALL_PREFIX} ${CMAKE_INSTALL_PREFIX}/llvm ${CMAKE_INSTALL_PREFIX}/hip /opt/rocm /opt/rocm/llvm /opt/rocm/hip "$ENV{ROCM_PATH}" "$ENV{HIP_PATH}") message("GPU_TARGETS= ${GPU_TARGETS}") +message("GPU_ARCHS= ${GPU_ARCHS}") +if(GPU_ARCHS) + #disable GPU_TARGETS to avoid conflicts, this needs to happen before we call hip package + unset(GPU_TARGETS CACHE) + unset(AMDGPU_TARGETS CACHE) +endif() find_package(hip) # No assumption that HIP kernels are launched with uniform block size for backward compatibility @@ -135,55 +136,38 @@ math(EXPR hip_VERSION_FLAT "(${hip_VERSION_MAJOR} * 1000 + ${hip_VERSION_MINOR}) message("hip_version_flat=${hip_VERSION_FLAT}") message("checking which targets are supported") -#This is the list of targets to be used in case 
GPU_TARGETS is not set on command line -#These targets will be filtered and only supported ones will be used -#Setting GPU_TARGETS on command line will override this list -if(NOT PROFILER_ONLY) - if(NOT ENABLE_ASAN_PACKAGING) - #build CK for all supported targets - if(NOT WIN32 AND ${hip_VERSION_FLAT} LESS 600300000) - # WORKAROUND: compiler does not yet fully support gfx12 targets, need to fix version above - rocm_check_target_ids(DEFAULT_GPU_TARGETS - TARGETS "gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102") - else() - rocm_check_target_ids(DEFAULT_GPU_TARGETS - TARGETS "gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201") - endif() +#In order to build just the CK library (without tests and examples) for all supported GPU targets +#use -D GPU_ARCHS="gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201" +#the GPU_TARGETS flag will be reset in this case in order to avoid conflicts. +# +#In order to build CK along with all tests and examples it should be OK to set GPU_TARGETS to just 1 or 2 similar architectures. 
+if(NOT ENABLE_ASAN_PACKAGING) + if(NOT WIN32 AND ${hip_VERSION_FLAT} LESS 600300000) + # WORKAROUND: compiler does not yet fully support gfx12 targets, need to fix version above + set(CK_GPU_TARGETS "gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102") else() - #build CK only for xnack-supported targets - rocm_check_target_ids(DEFAULT_GPU_TARGETS - TARGETS "gfx908:xnack+;gfx90a:xnack+;gfx940:xnack+;gfx941:xnack+;gfx942:xnack+") - set(GPU_TARGETS "${DEFAULT_GPU_TARGETS}" CACHE STRING " " FORCE) + set(CK_GPU_TARGETS "gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201") endif() else() - add_definitions(-DPROFILER_ONLY) - set(GPU_TARGETS "" CACHE STRING "" FORCE) + #build CK only for xnack-supported targets when using ASAN + set(CK_GPU_TARGETS "gfx908:xnack+;gfx90a:xnack+;gfx940:xnack+;gfx941:xnack+;gfx942:xnack+") +endif() + +#if user set GPU_ARCHS on the cmake command line, overwrite default target list with user's list +#otherwise, if user set GPU_TARGETS, use that set of targets +if(GPU_ARCHS) + set(CK_GPU_TARGETS ${GPU_ARCHS}) +else() if(GPU_TARGETS) - message(FATAL_ERROR "For PROFILE_ONLY build, please do not set GPU_TARGETS, use GPU_ARCH = gfx90, gfx94, gfx10, gfx11 or gfx12") + set(CK_GPU_TARGETS ${GPU_TARGETS}) endif() - if(GPU_ARCH MATCHES "gfx90") - rocm_check_target_ids(DEFAULT_GPU_TARGETS TARGETS "gfx908;gfx90a") - elseif(GPU_ARCH MATCHES "gfx94") - rocm_check_target_ids(DEFAULT_GPU_TARGETS TARGETS "gfx940;gfx941;gfx942") - elseif(GPU_ARCH MATCHES "gfx10") - rocm_check_target_ids(DEFAULT_GPU_TARGETS TARGETS "gfx1030") - elseif(GPU_ARCH MATCHES "gfx11") - rocm_check_target_ids(DEFAULT_GPU_TARGETS TARGETS "gfx1100;gfx1101;gfx1102") - elseif(GPU_ARCH MATCHES "gfx12") - rocm_check_target_ids(DEFAULT_GPU_TARGETS TARGETS "gfx1200;gfx1201") - else() - message(FATAL_ERROR "For PROFILE_ONLY build, please specify GPU_ARCH as gfx90, gfx94, gfx10, gfx11 or gfx12") - endif() - set(GPU_TARGETS "${DEFAULT_GPU_TARGETS}" 
CACHE STRING " " FORCE) endif() -message("Supported GPU_TARGETS= ${DEFAULT_GPU_TARGETS}") +#make sure all the targets on the list are actually supported by the current compiler +rocm_check_target_ids(SUPPORTED_GPU_TARGETS + TARGETS ${CK_GPU_TARGETS}) -if(GPU_TARGETS) - message("Building CK for the following targets: ${GPU_TARGETS}") -else() - message("Building CK for the default targets: ${DEFAULT_GPU_TARGETS}") -endif() +message("Building CK for the following targets: ${SUPPORTED_GPU_TARGETS}") if (GPU_TARGETS) if (GPU_TARGETS MATCHES "gfx9") @@ -557,8 +541,7 @@ ENDFOREACH() add_custom_target(instances DEPENDS utility;${CK_DEVICE_INSTANCES} SOURCES ${INSTANCE_FILES}) add_subdirectory(library) -if(NOT DEFINED INSTANCES_ONLY) - if(NOT DEFINED PROFILER_ONLY) +if(NOT GPU_ARCHS) rocm_package_setup_component(tests LIBRARY_NAME composablekernel PACKAGE_NAME tests # Prevent -static suffix on package name @@ -572,23 +555,15 @@ if(NOT DEFINED INSTANCES_ONLY) if(BUILD_TESTING) add_subdirectory(test) endif() - - rocm_package_setup_component(profiler - LIBRARY_NAME composablekernel - PACKAGE_NAME ckprofiler - ) - add_subdirectory(profiler) - else() - #When building PROFILER_ONLY, label the package with GPU_ARCH - rocm_package_setup_component(profiler - LIBRARY_NAME composablekernel - PACKAGE_NAME ckprofiler_${GPU_ARCH} - ) - add_subdirectory(profiler) - endif() endif() -if(NOT DEFINED PROFILER_ONLY AND (GPU_TARGETS MATCHES "gfx9" OR DEFINED INSTANCES_ONLY)) +rocm_package_setup_component(profiler + LIBRARY_NAME composablekernel + PACKAGE_NAME ckprofiler +) +add_subdirectory(profiler) + +if(GPU_TARGETS MATCHES "gfx9" OR GPU_ARCHS) add_subdirectory(codegen) endif() diff --git a/Jenkinsfile b/Jenkinsfile index 22468401dc..e61fb71e8e 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -1138,8 +1138,8 @@ pipeline { execute_args = """ cmake -D CMAKE_PREFIX_PATH=/opt/rocm \ -D CMAKE_CXX_COMPILER="${build_compiler()}" \ -D CMAKE_BUILD_TYPE=Release \ - -D INSTANCES_ONLY=ON \ - 
-DCMAKE_CXX_FLAGS=" -O3 " .. && make -j64 """ + -D GPU_ARCHS="gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201" \ + -D CMAKE_CXX_FLAGS=" -O3 " .. && make -j64 """ } steps{ buildHipClangJobAndReboot(setup_cmd: "", build_cmd: "", no_reboot:true, build_type: 'Release', execute_cmd: execute_args) diff --git a/README.md b/README.md index 4889914691..34ac0919ae 100644 --- a/README.md +++ b/README.md @@ -90,7 +90,12 @@ Docker images are available on [DockerHub](https://hub.docker.com/r/rocm/composa ``` If you don't set `GPU_TARGETS` on the cmake command line, CK is built for all GPU targets - supported by the current compiler (this may take a long time). + supported by the current compiler (this may take a long time). + + NOTE: If you try setting `GPU_TARGETS` to a list of architectures, the build will only work if the + architectures are similar, e.g., `gfx908;gfx90a`, or `gfx1100;gfx1101;gfx1102`. Otherwise, if you + want to build the library for a list of different architectures, + you should use the `GPU_ARCHS` build argument, for example `GPU_ARCHS=gfx908;gfx1030;gfx1100;gfx942`. 4. Build the entire CK library: @@ -137,10 +142,6 @@ crash. In such cases, you can reduce the number of threads to 32 by using `-j32` Additional cmake flags can be used to significantly speed-up the build: -* `INSTANCES_ONLY` (default is OFF) must be set to ON in order to build only the instances and library - while skipping all tests, examples, and profiler. This is useful in cases when you plan to use CK as a - dependency and don't plan to run any examples or tests. - * `DTYPES` (default is not set) can be set to any subset of "fp64;fp32;fp16;fp8;bf16;int8" to build instances of select data types only. The main default data types are fp32 and fp16; you can safely skip other data types. 
diff --git a/codegen/test/CMakeLists.txt b/codegen/test/CMakeLists.txt index 6dd130bc3f..1de612e49a 100644 --- a/codegen/test/CMakeLists.txt +++ b/codegen/test/CMakeLists.txt @@ -1,7 +1,8 @@ list(APPEND CMAKE_PREFIX_PATH /opt/rocm) add_subdirectory(rtc) file(GLOB TEST_SRCS CONFIGURE_DEPENDS *.cpp) -if(NOT INSTANCES_ONLY) +# do not build the tests when we build the library for various targets +if(NOT GPU_ARCHS) foreach(TEST_SRC ${TEST_SRCS}) set_source_files_properties(${TEST_SRC} PROPERTIES LANGUAGE HIP) get_filename_component(BASE_NAME ${TEST_SRC} NAME_WE) diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt index f9e62a2356..ad3f7c787f 100644 --- a/example/CMakeLists.txt +++ b/example/CMakeLists.txt @@ -45,11 +45,7 @@ function(add_example_executable EXAMPLE_NAME FILE_NAME) endforeach() endif() - if(INSTANCES_ONLY) - set(EX_TARGETS ${DEFAULT_GPU_TARGETS}) - else() - set(EX_TARGETS ${GPU_TARGETS}) - endif() + set(EX_TARGETS ${SUPPORTED_GPU_TARGETS}) #Do not build any DL examples if DL_KERNELS not set foreach(source IN LISTS FILE_NAME) @@ -147,11 +143,8 @@ function(add_example_executable_no_testing EXAMPLE_NAME FILE_NAME) endforeach() endif() - if(INSTANCES_ONLY) - set(EX_TARGETS ${DEFAULT_GPU_TARGETS}) - else() - set(EX_TARGETS ${GPU_TARGETS}) - endif() + set(EX_TARGETS ${SUPPORTED_GPU_TARGETS}) + #Do not build any DL examples if DL_KERNELS not set foreach(source IN LISTS FILE_NAME) if(NOT DEFINED DL_KERNELS AND source MATCHES "_dl") diff --git a/include/ck/config.h.in b/include/ck/config.h.in index eb9049b599..0f0b7bd607 100644 --- a/include/ck/config.h.in +++ b/include/ck/config.h.in @@ -97,13 +97,6 @@ #cmakedefine CK_ENABLE_DL_KERNELS @CK_ENABLE_DL_KERNELS@ #endif -// -// Instances supports in the current CK build -// -#ifndef CK_ENABLE_INSTANCES_ONLY -#cmakedefine CK_ENABLE_INSTANCES_ONLY @CK_ENABLE_INSTANCES_ONLY@ -#endif - // // CK kernels which support XDL (MI series) // diff --git a/library/src/tensor_operation_instance/gpu/CMakeLists.txt 
b/library/src/tensor_operation_instance/gpu/CMakeLists.txt index bc66fe0bed..f82176ffc6 100644 --- a/library/src/tensor_operation_instance/gpu/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/CMakeLists.txt @@ -37,11 +37,7 @@ function(add_instance_library INSTANCE_NAME) endforeach() endif() - if(INSTANCES_ONLY) - set(INST_TARGETS ${DEFAULT_GPU_TARGETS}) - else() - set(INST_TARGETS ${GPU_TARGETS}) - endif() + set(INST_TARGETS ${SUPPORTED_GPU_TARGETS}) # Do not build DL instances if DL_KERNELS macro is not set foreach(source IN LISTS ARGN) @@ -75,11 +71,7 @@ function(add_instance_library INSTANCE_NAME) if(ARGN) set(INST_OBJ) foreach(source IN LISTS ARGN) - if(INSTANCES_ONLY) - set(INST_TARGETS ${DEFAULT_GPU_TARGETS}) - else() - set(INST_TARGETS ${GPU_TARGETS}) - endif() + set(INST_TARGETS ${SUPPORTED_GPU_TARGETS}) if(source MATCHES "_xdl") list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201) elseif(ARGN MATCHES "_wmma") @@ -191,12 +183,7 @@ FOREACH(subdir_path ${dir_list}) set(add_inst 1) endif() - if(INSTANCES_ONLY) - set(INST_TARGETS ${DEFAULT_GPU_TARGETS}) - else() - set(INST_TARGETS ${GPU_TARGETS}) - endif() - + set(INST_TARGETS ${SUPPORTED_GPU_TARGETS}) if(("${cmake_instance}" MATCHES "quantization") AND (DEFINED DTYPES) AND (NOT DTYPES MATCHES "int8")) message("quantization instances will not be built!") diff --git a/profiler/src/CMakeLists.txt b/profiler/src/CMakeLists.txt index e9528baeb6..7d4df3cf9b 100644 --- a/profiler/src/CMakeLists.txt +++ b/profiler/src/CMakeLists.txt @@ -24,7 +24,7 @@ set(PROFILER_SOURCES profile_permute_scale.cpp ) -if(GPU_TARGETS MATCHES "gfx9") +if(SUPPORTED_GPU_TARGETS MATCHES "gfx9") if(DTYPES MATCHES "fp32" OR DTYPES MATCHES "fp64" OR NOT DEFINED DTYPES) list(APPEND PROFILER_SOURCES profile_contraction_bilinear.cpp) list(APPEND PROFILER_SOURCES profile_contraction_scale.cpp) @@ -49,7 +49,7 @@ if(GPU_TARGETS MATCHES "gfx9") list(APPEND PROFILER_SOURCES 
profile_grouped_gemm_multiply_tile_loop.cpp) endif() list(APPEND PROFILER_SOURCES profile_gemm_multiply_add.cpp) - if(GPU_TARGETS MATCHES "gfx94") + if(SUPPORTED_GPU_TARGETS MATCHES "gfx94") list(APPEND PROFILER_SOURCES profile_gemm_multiply_multiply.cpp) list(APPEND PROFILER_SOURCES profile_gemm_ab_scale.cpp) endif() @@ -69,7 +69,7 @@ if(GPU_TARGETS MATCHES "gfx9") endif() -if(GPU_TARGETS MATCHES "gfx11" OR GPU_TARGETS MATCHES "gfx12" OR GPU_TARGETS MATCHES "gfx9") +if(SUPPORTED_GPU_TARGETS MATCHES "gfx11" OR SUPPORTED_GPU_TARGETS MATCHES "gfx12" OR SUPPORTED_GPU_TARGETS MATCHES "gfx9") if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) list(APPEND PROFILER_SOURCES profile_gemm_bilinear.cpp) endif() @@ -111,7 +111,7 @@ target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_column_to_image_inst target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_transpose_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_permute_scale_instance) -if(GPU_TARGETS MATCHES "gfx9") +if(SUPPORTED_GPU_TARGETS MATCHES "gfx9") if(DTYPES MATCHES "fp32" OR DTYPES MATCHES "fp64" OR NOT DEFINED DTYPES) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_bilinear_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_scale_instance) @@ -135,7 +135,7 @@ if(GPU_TARGETS MATCHES "gfx9") target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_reduce_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_multiply_add_instance) - if(GPU_TARGETS MATCHES "gfx94") + if(SUPPORTED_GPU_TARGETS MATCHES "gfx94") target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_multiply_multiply_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_ab_scale_instance) endif() @@ -159,7 +159,7 @@ if(GPU_TARGETS MATCHES "gfx9") target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE 
device_grouped_conv3d_fwd_convinvscale_instance) endif() -if(GPU_TARGETS MATCHES "gfx9" OR GPU_TARGETS MATCHES "gfx11" OR GPU_TARGETS MATCHES "gfx12") +if(SUPPORTED_GPU_TARGETS MATCHES "gfx9" OR SUPPORTED_GPU_TARGETS MATCHES "gfx11" OR SUPPORTED_GPU_TARGETS MATCHES "gfx12") if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_bilinear_instance) endif() diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index e61d937f08..b836dd687e 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -41,11 +41,7 @@ function(add_test_executable TEST_NAME) endforeach() endif() - if(INSTANCES_ONLY) - set(TEST_TARGETS ${DEFAULT_GPU_TARGETS}) - else() - set(TEST_TARGETS ${GPU_TARGETS}) - endif() + set(TEST_TARGETS ${SUPPORTED_GPU_TARGETS}) foreach(source IN LISTS ARGN) if(NOT DEFINED DL_KERNELS AND source MATCHES "_dl") @@ -122,11 +118,7 @@ function(add_gtest_executable TEST_NAME) endforeach() endif() - if(INSTANCES_ONLY) - set(TEST_TARGETS ${DEFAULT_GPU_TARGETS}) - else() - set(TEST_TARGETS ${GPU_TARGETS}) - endif() + set(TEST_TARGETS ${SUPPORTED_GPU_TARGETS}) foreach(source IN LISTS ARGN) if(NOT DEFINED DL_KERNELS AND source MATCHES "_dl") @@ -211,10 +203,10 @@ add_subdirectory(conv_tensor_rearrange) add_subdirectory(transpose) add_subdirectory(permute_scale) add_subdirectory(wrapper) -if(GPU_TARGETS MATCHES "gfx11") +if(SUPPORTED_GPU_TARGETS MATCHES "gfx11") add_subdirectory(wmma_op) endif() -if(GPU_TARGETS MATCHES "gfx942" AND CK_HIP_VERSION_MAJOR GREATER_EQUAL 6 AND CK_HIP_VERSION_MINOR GREATER_EQUAL 2) # smfmac needs ROCm6.2 +if(SUPPORTED_GPU_TARGETS MATCHES "gfx942" AND CK_HIP_VERSION_MAJOR GREATER_EQUAL 6 AND CK_HIP_VERSION_MINOR GREATER_EQUAL 2) # smfmac needs ROCm6.2 add_subdirectory(smfmac_op) endif() add_subdirectory(position_embedding)