diff --git a/Jenkinsfile b/Jenkinsfile index 0363b07d89..b70c28ad39 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -438,34 +438,6 @@ def cmake_build(Map conf=[:]){ echo "could not locate the requested artifacts: ${err.getMessage()}. will skip the stashing." } } - if (params.RUN_CK_TILE_TRANSPOSE_TESTS){ - try{ - archiveArtifacts "perf_transpose_*.log" - if (arch_type == 1){ - stash includes: "perf_transpose_**_gfx90a.log", name: "perf_transpose_log_gfx90a" - } - else if (arch_type == 2){ - stash includes: "perf_transpose_**_gfx942.log", name: "perf_transpose_log_gfx942" - } - } - catch(Exception err){ - echo "could not locate the requested artifacts: ${err.getMessage()}. will skip the stashing." - } - } - if (params.RUN_CK_TILE_GEMM_TESTS){ - try{ - archiveArtifacts "perf_tile_gemm_**.log" - if (arch == 1){ - stash includes: "perf_tile_gemm_**_gfx90a.log", name: "perf_tile_gemm_log_gfx90a" - } - else if (arch == 2){ - stash includes: "perf_tile_gemm_**_gfx942.log", name: "perf_tile_gemm_log_gfx942" - } - } - catch(Exception err){ - echo "could not locate the requested artifacts: ${err.getMessage()}. will skip the stashing." - } - } } def buildHipClangJob(Map conf=[:]){ @@ -762,24 +734,6 @@ def process_results(Map conf=[:]){ echo "could not locate the FMHA performance logs: ${err.getMessage()}." } } - if (params.RUN_CK_TILE_TRANSPOSE_TESTS){ - try{ - unstash "perf_transpose_log_gfx942" - unstash "perf_transpose_log_gfx90a" - } - catch(Exception err){ - echo "could not locate the Transpose performance logs: ${err.getMessage()}." - } - } - if (params.RUN_CK_TILE_GEMM_TESTS){ - try{ - unstash "perf_tile_gemm_log_gfx942" - unstash "perf_tile_gemm_log_gfx90a" - } - catch(Exception err){ - echo "could not locate the GEMM performance logs: ${err.getMessage()}." - } - } if (params.RUN_FULL_QA || params.BUILD_INSTANCES_ONLY){ // unstash deb packages unstash "packages" @@ -861,7 +815,7 @@ def run_aiter_tests(Map conf=[:]){ } //launch develop branch daily jobs -CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;DISABLE_DL_KERNELS=true;RUN_CK_TILE_FMHA_TESTS=true;RUN_CK_TILE_TRANSPOSE_TESTS=true;RUN_CK_TILE_GEMM_TESTS=true;RUN_TILE_ENGINE_GEMM_TESTS=true;RUN_PERFORMANCE_TESTS=true;RUN_ALL_UNIT_TESTS=true +CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;DISABLE_DL_KERNELS=true;RUN_CK_TILE_FMHA_TESTS=true;RUN_TILE_ENGINE_GEMM_TESTS=true;RUN_PERFORMANCE_TESTS=true;RUN_ALL_UNIT_TESTS=true 0 21 * * * % RUN_GROUPED_CONV_LARGE_CASES_TESTS=true;hipTensor_test=true;BUILD_GFX908=true;BUILD_GFX942=true;BUILD_GFX950=true;RUN_PERFORMANCE_TESTS=true;RUN_ALL_UNIT_TESTS=true 0 19 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true;RUN_ALL_UNIT_TESTS=true 0 17 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-mainline;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true;RUN_ALL_UNIT_TESTS=true @@ -941,14 +895,6 @@ pipeline { name: "RUN_CK_TILE_FMHA_TESTS", defaultValue: false, description: "Run the ck_tile FMHA tests (default: OFF)") - booleanParam( - name: "RUN_CK_TILE_TRANSPOSE_TESTS", - defaultValue: false, - description: "Run the ck_tile Transpose tests (default: OFF)") - booleanParam( - name: "RUN_CK_TILE_GEMM_TESTS", - defaultValue: false, - description: "Run the ck_tile GEMM tests (default: OFF)") booleanParam( name: "RUN_TILE_ENGINE_GEMM_TESTS", defaultValue: false, @@ -1198,94 +1144,6 @@ pipeline { } } } - stage("Run CK_TILE_TRANSPOSE Tests") - { - parallel - { - stage("Run CK_TILE_TRANSPOSE Tests on gfx90a") - { - when { - beforeAgent true - expression { params.RUN_CK_TILE_TRANSPOSE_TESTS.toBoolean() } - } - agent{ label rocmnode("gfx90a") } - environment{ - setup_args = "NO_CK_BUILD" - execute_args = """ ../script/cmake-ck-dev.sh ../ gfx90a && \ - make -j64 tile_example_batched_transpose && \ - cd ../ && - example/ck_tile/35_batched_transpose/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx90a """ - } - steps{ - buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args) - cleanWs() - } - } - stage("Run CK_TILE_TRANSPOSE Tests on gfx942") - { - when { - beforeAgent true - expression { params.RUN_CK_TILE_TRANSPOSE_TESTS.toBoolean() } - } - agent{ label rocmnode("gfx942") } - environment{ - setup_args = "NO_CK_BUILD" - execute_args = """ ../script/cmake-ck-dev.sh ../ gfx942 && \ - make -j64 tile_example_batched_transpose && \ - cd ../ && - example/ck_tile/35_batched_transpose/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx942 """ - } - steps{ - buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args) - cleanWs() - } - } - } - } - stage("Run CK_TILE_GEMM Tests") - { - parallel - { - stage("Run CK_TILE_GEMM Tests on gfx90a") - { - when { - beforeAgent true - expression { params.RUN_CK_TILE_GEMM_TESTS.toBoolean() } - } - agent{ label rocmnode("gfx90a") } - environment{ - setup_args = "NO_CK_BUILD" - execute_args = """ ../script/cmake-ck-dev.sh ../ gfx90a && \ - make -j64 tile_example_gemm_universal && \ - cd ../ && - example/ck_tile/03_gemm/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx90a """ - } - steps{ - buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args) - cleanWs() - } - } - stage("Run CK_TILE_GEMM Tests on gfx942") - { - when { - beforeAgent true - expression { params.RUN_CK_TILE_GEMM_TESTS.toBoolean() } - } - agent{ label rocmnode("gfx942") } - environment{ - setup_args = "NO_CK_BUILD" - execute_args = """ ../script/cmake-ck-dev.sh ../ gfx942 && \ - make -j64 tile_example_gemm_universal && \ - cd ../ && - example/ck_tile/03_gemm/script/run_full_test.sh "CI_${params.COMPILER_VERSION}" "${env.BRANCH_NAME}" "${NODE_NAME}" gfx942 """ - } - steps{ - buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args) - cleanWs() - } - } - } - } stage("Run TILE_ENGINE_GEMM Tests") { parallel @@ -1492,7 +1350,7 @@ pipeline { -DGPU_TARGETS="gfx90a" \ -DCMAKE_CXX_COMPILER="${build_compiler()}" \ -DCMAKE_C_COMPILER=/opt/rocm/llvm/bin/clang \ - -DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """ + -DCMAKE_CXX_FLAGS=" -O3 " .. && make -j 32""" } steps{ Build_CK_and_Reboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local') diff --git a/tile_engine/ops/gemm/CMakeLists.txt b/tile_engine/ops/gemm/CMakeLists.txt index fe9b7802a7..d8200ed947 100644 --- a/tile_engine/ops/gemm/CMakeLists.txt +++ b/tile_engine/ops/gemm/CMakeLists.txt @@ -1,148 +1,215 @@ - set(GEMM_DATATYPE "fp8;fp16" CACHE STRING "List of datatypes for GEMM (semicolon-separated)") set(GEMM_LAYOUT "rcr" CACHE STRING "List of layout for GEMM (semicolon-separated)") +# Pre-generate all kernel lists to avoid blocking during parallel builds +foreach(dt IN LISTS GEMM_DATATYPE) + foreach(l IN LISTS GEMM_LAYOUT) + set(working_path "${CMAKE_CURRENT_BINARY_DIR}/${dt}/${l}") + file(MAKE_DIRECTORY "${working_path}") + + if (l STREQUAL "rcr") + set(json_blob "${CMAKE_CURRENT_LIST_DIR}/configs/default_config.json") + else() + set(json_blob "${CMAKE_CURRENT_LIST_DIR}/configs/custom_ci_config.json") + endif() + + # Only run if files don't exist + if (NOT EXISTS "${working_path}/gemm_instance_blobs.txt") + execute_process( + COMMAND ${Python3_EXECUTABLE} "${CMAKE_CURRENT_LIST_DIR}/gemm_instance_builder.py" + --working_path "${working_path}" + --datatype "${dt}" + --layout "${l}" + --config_json "${json_blob}" + --list_blobs + RESULT_VARIABLE ret + ) + if (NOT ret EQUAL 0) + message(FATAL_ERROR "Failed to pre-generate kernel list for ${dt} ${l}") + endif() + endif() + endforeach() +endforeach() + function(build_gemm_for_datatype datatype layout) set(working_path "${CMAKE_CURRENT_BINARY_DIR}/${datatype}/${layout}") - # Comment this if-else block when using user_provided_config - if(layout STREQUAL "rcr") + if (layout STREQUAL "rcr") set(json_blob "${CMAKE_CURRENT_LIST_DIR}/configs/default_config.json") else() set(json_blob "${CMAKE_CURRENT_LIST_DIR}/configs/custom_ci_config.json") endif() - - # uncomment this if you want to use user_provided_config.json + # Uncomment to override: # set(json_blob "${CMAKE_CURRENT_LIST_DIR}/configs/user_provided_config.json") - - # Generate kernel list - execute_process( - COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/gemm_instance_builder.py - --working_path ${working_path} - --datatype ${datatype} - --layout ${layout} - --config_json ${json_blob} - --list_blobs - RESULT_VARIABLE ret - ) - if(NOT ret EQUAL 0) - message(FATAL_ERROR "Failed to list kernels for ${datatype} ${layout}: ${ret}") - endif() + # Read pre-generated kernel lists file(STRINGS "${working_path}/gemm_instance_blobs.txt" codegen_blobs) file(STRINGS "${working_path}/gemm_instance_blobs_range.txt" codegen_blobs_range) - + # Generate the blobs add_custom_command( OUTPUT ${codegen_blobs} - COMMAND ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/gemm_instance_builder.py + COMMAND ${Python3_EXECUTABLE} "${CMAKE_CURRENT_LIST_DIR}/gemm_instance_builder.py" --working_path "${working_path}" - --datatype ${datatype} - --layout ${layout} + --datatype "${datatype}" + --layout "${layout}" --config_json "${json_blob}" --gen_blobs COMMENT "Generating GEMM instance sources for ${datatype} ${layout}" ) add_custom_target(gemm_gen_${datatype}_${layout} DEPENDS ${codegen_blobs}) - set(intermediate_libs) - list(LENGTH codegen_blobs codegen_blobs_len) + # Parse ranges to identify unique trait names + set(unique_traits) + foreach(range_line IN LISTS codegen_blobs_range) + string(STRIP "${range_line}" stripped_line) + separate_arguments(split_line UNIX_COMMAND "${stripped_line}") + list(GET split_line 0 trait_name) + list(APPEND unique_traits "${trait_name}") + endforeach() + list(REMOVE_DUPLICATES unique_traits) - foreach(blob IN LISTS codegen_blobs_range) - string(STRIP "${blob}" stripped_blob) - separate_arguments(spilit_blob UNIX_COMMAND "${stripped_blob}") - # Each line is: - list(GET spilit_blob 0 name) - list(GET spilit_blob 1 first) - list(GET spilit_blob 2 last) - math(EXPR total_files "${last} - ${first}") - if(total_files EQUAL 0) - continue() # nothing for this trait - endif() + # Build each trait separately + foreach(trait IN LISTS unique_traits) + set(trait_files) + foreach(range_line IN LISTS codegen_blobs_range) + string(STRIP "${range_line}" stripped_line) + separate_arguments(split_line UNIX_COMMAND "${stripped_line}") + list(GET split_line 0 name) + if (name STREQUAL trait) + list(GET split_line 1 first) + list(GET split_line 2 last) + math(EXPR total_files "${last} - ${first}") + if (total_files GREATER 0) + foreach(j RANGE ${first} ${last}-1) + list(LENGTH codegen_blobs blobs_len) + if (j LESS blobs_len) + list(GET codegen_blobs ${j} f) + list(APPEND trait_files "${f}") + endif() + endforeach() + endif() + endif() + endforeach() - # Object libraries (chunked) per trait - set(sub_intermediate_libs) - set(chunk_size 3) - math(EXPR num_chunks "( ${total_files} + ${chunk_size} - 1 ) / ${chunk_size}") - math(EXPR num_chunks_minus_1 "${num_chunks} - 1") - - foreach(i RANGE 0 ${num_chunks_minus_1}) - math(EXPR start "${first} + ${i} * ${chunk_size} ") - math(EXPR end "${start} + ${chunk_size} - 1") + if (trait_files) + # Create object libraries with chunking + set(chunk_size 3) # adjust as needed for memory vs parallelism + list(LENGTH trait_files num_files) + math(EXPR num_chunks "( ${num_files} + ${chunk_size} - 1 ) / ${chunk_size}") - set(chunk_files) - foreach(j RANGE ${start} ${end}) - if(j LESS ${last} AND j LESS ${codegen_blobs_len}) - list(GET codegen_blobs ${j} f) - list(APPEND chunk_files "${f}") + set(trait_obj_libs) + foreach(i RANGE 0 ${num_chunks}-1) + math(EXPR start "${i} * ${chunk_size}") + math(EXPR end "${start} + ${chunk_size} - 1") + + set(chunk_files) + foreach(j RANGE ${start} ${end}) + if (j LESS ${num_files}) + list(GET trait_files ${j} f) + list(APPEND chunk_files "${f}") + endif() + endforeach() + + if (chunk_files) + set(obj_lib_name "gemm_obj_${trait}_${i}_${datatype}_${layout}") + add_library(${obj_lib_name} OBJECT ${chunk_files}) + add_dependencies(${obj_lib_name} gemm_gen_${datatype}_${layout}) + + target_compile_options(${obj_lib_name} PRIVATE + -Wno-undefined-func-template + -Wno-float-equal + --offload-compress + -O3 + -fno-exceptions + ) + + set_target_properties(${obj_lib_name} PROPERTIES + UNITY_BUILD ON + UNITY_BUILD_BATCH_SIZE 2 + ) + + list(APPEND trait_obj_libs "${obj_lib_name}") endif() endforeach() - #list(LENGTH chunk_files chunk_files_len) - #if(chunk_files_len AND chunk_files_len GREATER 1) - if(chunk_files) - set(sub_intermediate_lib_name "gemm_objlib_${name}_${i}_${datatype}_${layout}") - add_library(${sub_intermediate_lib_name} OBJECT ${chunk_files}) - list(APPEND sub_intermediate_libs ${sub_intermediate_lib_name}) + # Static library for this trait + if (trait_obj_libs) + set(trait_lib_name "gemm_lib_${trait}_${datatype}_${layout}") + set(obj_exprs) + foreach(objlib IN LISTS trait_obj_libs) + list(APPEND obj_exprs "$") + endforeach() + + add_library(${trait_lib_name} STATIC ${obj_exprs}) + add_dependencies(${trait_lib_name} gemm_gen_${datatype}_${layout}) + + # Trait-specific executable + set(exec_name "benchmark_gemm_${datatype}_${layout}_${trait}") + add_executable(${exec_name} benchmark_gemm.cpp) + target_link_libraries(${exec_name} PRIVATE ${trait_lib_name}) + target_include_directories(${exec_name} PRIVATE + "${CMAKE_CURRENT_LIST_DIR}" + "${working_path}" + ) + target_compile_definitions(${exec_name} PRIVATE + GEMM_TRAIT_FILTER="${trait}" + ) + target_compile_options(${exec_name} PRIVATE + -Wno-undefined-func-template + -Wno-float-equal + --offload-compress + ) endif() - - endforeach() - - # ------------------ Bundle the object libs into one static lib --------- - #list(LENGTH sub_intermediate_libs sub_intermediate_libs_len) - #if(sub_intermediate_libs AND sub_intermediate_libs_len GREATER 1) - if(sub_intermediate_libs) - set(intermediate_lib_name "gemm_staticlib_${name}_${datatype}_${layout}") - # Collect the $ expressions - - set(obj_exprs) - foreach(objlib IN LISTS sub_intermediate_libs) - list(APPEND obj_exprs $) - endforeach() - - add_library(${intermediate_lib_name} STATIC ${obj_exprs}) - add_dependencies(${intermediate_lib_name} gemm_gen_${datatype}_${layout}) - #foreach(objlib IN LISTS sub_intermediate_libs) - # target_sources(${intermediate_lib_name} PRIVATE $) - #endforeach() - list(APPEND intermediate_libs ${intermediate_lib_name}) endif() - endforeach() - - # Interface library for instances - add_library(gemm_template_instances_${datatype}_${layout} INTERFACE) - add_dependencies(gemm_template_instances_${datatype}_${layout} gemm_gen_${datatype}_${layout}) - target_link_libraries(gemm_template_instances_${datatype}_${layout} INTERFACE ${intermediate_libs}) - target_include_directories(gemm_template_instances_${datatype}_${layout} INTERFACE - ${CMAKE_CURRENT_LIST_DIR} - "${working_path}" - ) - set_target_properties(gemm_template_instances_${datatype}_${layout} PROPERTIES LINKER_LANGUAGE CXX) - - # Host API interface library - add_library(gemm_host_api_${datatype}_${layout} INTERFACE) - target_link_libraries(gemm_host_api_${datatype}_${layout} INTERFACE gemm_template_instances_${datatype}_${layout}) - target_include_directories(gemm_host_api_${datatype}_${layout} INTERFACE - ${CMAKE_CURRENT_LIST_DIR} - "${working_path}" - ) - - # Executable per datatype - set(exec_name "benchmark_gemm_${datatype}_${layout}") - add_executable(${exec_name} benchmark_gemm.cpp) - target_link_libraries(${exec_name} PRIVATE gemm_host_api_${datatype}_${layout}) - target_compile_options(${exec_name} PRIVATE - -Wno-undefined-func-template - -Wno-float-equal - --offload-compress - ) + # Master executable including all traits + set(all_trait_libs) + foreach(trait IN LISTS unique_traits) + if (TARGET gemm_lib_${trait}_${datatype}_${layout}) + list(APPEND all_trait_libs "gemm_lib_${trait}_${datatype}_${layout}") + endif() + endforeach() + + if (all_trait_libs) + add_executable(benchmark_gemm_${datatype}_${layout} benchmark_gemm.cpp) + target_link_libraries(benchmark_gemm_${datatype}_${layout} PRIVATE ${all_trait_libs}) + target_include_directories(benchmark_gemm_${datatype}_${layout} PRIVATE + "${CMAKE_CURRENT_LIST_DIR}" + "${working_path}" + ) + target_compile_options(benchmark_gemm_${datatype}_${layout} PRIVATE + -Wno-undefined-func-template + -Wno-float-equal + --offload-compress + ) + endif() endfunction() -# Process each datatype in isolation +# Process each datatype/layout foreach(dt IN LISTS GEMM_DATATYPE) foreach(l IN LISTS GEMM_LAYOUT) - build_gemm_for_datatype(${dt} ${l}) + build_gemm_for_datatype("${dt}" "${l}") endforeach() endforeach() + +# Master target for parallel builds +set(ALL_GEMM_TARGETS) +foreach(dt IN LISTS GEMM_DATATYPE) + foreach(l IN LISTS GEMM_LAYOUT) + list(APPEND ALL_GEMM_TARGETS "benchmark_gemm_${dt}_${l}") + endforeach() +endforeach() +add_custom_target(benchmark_gemm_all DEPENDS ${ALL_GEMM_TARGETS}) + +# Use faster linker if available +find_program(LLD_LINKER "ld.lld") +find_program(MOLD_LINKER "mold") +if (MOLD_LINKER) + message(STATUS "Using mold linker for faster linking") + add_link_options(-fuse-ld=mold) +elseif (LLD_LINKER) + message(STATUS "Using lld linker for faster linking") + add_link_options(-fuse-ld=lld) +endif() \ No newline at end of file