diff --git a/Jenkinsfile b/Jenkinsfile index 9d1af7c5d9..efe08a7d41 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -157,9 +157,9 @@ def getDockerImage(Map conf=[:]){ image = getDockerImageName() echo "Using default docker: ${image}" } - //Check if image exists + //Check if image exists def retimage - try + try { echo "Pulling image: ${image}" retimage = docker.image("${image}") @@ -232,7 +232,7 @@ def cmake_build(Map conf=[:]){ def setup_args = conf.get("setup_args","") // make sure all unit tests always run on develop branch def runAllUnitTests = (env.BRANCH_NAME == "develop") ? true : params.RUN_ALL_UNIT_TESTS - + if (prefixpath != "/usr/local"){ setup_args = setup_args + " -DCMAKE_PREFIX_PATH=${prefixpath} " } @@ -357,7 +357,7 @@ def cmake_build(Map conf=[:]){ "build_cmd", "${build_envs} ninja -j${nt} ${config_targets}" ) - + cmd = conf.get("cmd", """ ${setup_cmd} ${build_cmd} @@ -449,7 +449,7 @@ def buildHipClangJob(Map conf=[:]){ checkout scm def prefixpath = conf.get("prefixpath", "/opt/rocm") - // Jenkins is complaining about the render group + // Jenkins is complaining about the render group def dockerOpts if ( params.BUILD_INSTANCES_ONLY ){ dockerOpts = "--group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined" @@ -515,7 +515,7 @@ def Build_CK(Map conf=[:]){ checkout scm def prefixpath = conf.get("prefixpath", "/opt/rocm") - // Jenkins is complaining about the render group + // Jenkins is complaining about the render group def dockerOpts="--device=/dev/kfd --device=/dev/dri --group-add video --group-add render --cap-add=SYS_PTRACE --security-opt seccomp=unconfined" if (conf.get("enforce_xnack_on", false)) { dockerOpts = dockerOpts + " --env HSA_XNACK=1 " @@ -719,7 +719,7 @@ def process_results(Map conf=[:]){ def image = "${env.CK_DOCKERHUB}:ck_ub22.04_rocm6.3" def prefixpath = "/opt/rocm" - // Jenkins is complaining about the render group + // Jenkins is complaining about the render group def dockerOpts="--cap-add=SYS_PTRACE --security-opt seccomp=unconfined" if (conf.get("enforce_xnack_on", false)) { dockerOpts = dockerOpts + " --env HSA_XNACK=1 " @@ -956,20 +956,20 @@ pipeline { defaultValue: '', description: 'If you want to use a custom docker image, please specify it here (default: leave blank).') string( - name: 'ROCMVERSION', + name: 'ROCMVERSION', defaultValue: '6.4.1', description: 'Specify which ROCM version to use: 6.4.1 (default).') string( - name: 'COMPILER_VERSION', - defaultValue: '', + name: 'COMPILER_VERSION', + defaultValue: '', description: 'Specify which version of compiler to use: release, amd-staging, amd-mainline, or leave blank (default).') string( - name: 'COMPILER_COMMIT', - defaultValue: '', + name: 'COMPILER_COMMIT', + defaultValue: '', description: 'Specify which commit of compiler branch to use: leave blank to use the latest commit (default), or use some specific commit of llvm-project branch.') string( - name: 'BUILD_COMPILER', - defaultValue: '/opt/rocm/llvm/bin/clang++', + name: 'BUILD_COMPILER', + defaultValue: '/opt/rocm/llvm/bin/clang++', description: 'Build CK with /opt/rocm/bin/hipcc, /llvm-project/build/bin/clang++, or with /opt/rocm/llvm/bin/clang++ (default).') booleanParam( name: "RUN_FULL_QA", @@ -1448,6 +1448,36 @@ pipeline { cleanWs() } } + stage("Run TILE_ENGINE_GEMM Tests on gfx1201") + { + when { + beforeAgent true + expression { params.RUN_TILE_ENGINE_GEMM_TESTS.toBoolean() } + } + agent{ label rocmnode("gfx1201") } + environment{ + setup_args = "NO_CK_BUILD" + execute_args = """ cmake -G Ninja -D CMAKE_PREFIX_PATH=/opt/rocm \ + -D CMAKE_CXX_COMPILER="${build_compiler()}" \ + -D CMAKE_BUILD_TYPE=Release \ + -D GPU_TARGETS="gfx1201" \ + -D GEMM_DATATYPE="fp16" \ + -D GEMM_LAYOUT="rcr;rrr;crr;ccr" \ + -DGEMM_CONFIG_FILE=gfx120x_config.json \ + -DCMAKE_CXX_FLAGS=" -O3 " .. && \ + ninja -j64 benchmark_gemm_all && \ + python3 ../tile_engine/ops/gemm/gemm_benchmark.py . --problem-sizes "1024,1024,1024" \ + --warmup 5 --repeat 5 --verbose --json results.json && \ + ninja -j64 benchmark_gemm_fp16_rcr && \ + ninja -j64 benchmark_gemm_fp16_rrr && \ + ninja -j64 benchmark_gemm_fp16_crr && \ + ninja -j64 benchmark_gemm_fp16_ccr """ + } + steps{ + buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args) + cleanWs() + } + } } } @@ -1591,7 +1621,7 @@ pipeline { agent{ label rocmnode("gfx942") } steps{ script { - def execute_args = params.NINJA_FTIME_TRACE ? + def execute_args = params.NINJA_FTIME_TRACE ? """ cmake -G Ninja -D CMAKE_PREFIX_PATH=/opt/rocm \ -D CMAKE_CXX_COMPILER="${build_compiler()}" \ -D CMAKE_BUILD_TYPE=Release \ @@ -1600,7 +1630,7 @@ pipeline { -D CMAKE_CXX_COMPILER="${build_compiler()}" \ -D CMAKE_BUILD_TYPE=Release \ -D CMAKE_CXX_FLAGS=" -O3 " .. && ninja -j64 """ - + buildHipClangJobAndReboot(setup_cmd: "", build_cmd: "", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, docker_name: "${env.CK_DOCKERHUB_PRIVATE}:ck_ub24.04_rocm7.0") } cleanWs() diff --git a/script/cmake-ck-dev.sh b/script/cmake-ck-dev.sh index 086359a79f..6220009b03 100755 --- a/script/cmake-ck-dev.sh +++ b/script/cmake-ck-dev.sh @@ -20,7 +20,7 @@ fi GPU_TARGETS="gfx908;gfx90a;gfx942" if [ $# -ge 1 ]; then - case "$1" in + case "$1" in gfx*) GPU_TARGETS=$1 shift 1 @@ -38,7 +38,7 @@ fi cmake \ -D CMAKE_PREFIX_PATH=/opt/rocm/ \ -D CMAKE_CXX_COMPILER=/opt/rocm/llvm/bin/clang++ \ --D CMAKE_CXX_FLAGS="-ftemplate-backtrace-limit=0 -fPIE -Wno-gnu-line-marker" \ +-D CMAKE_CXX_FLAGS="-ftemplate-backtrace-limit=0 -fPIE -Wno-gnu-line-marker -fbracket-depth=512" \ -D CMAKE_BUILD_TYPE=Release \ -D BUILD_DEV=ON \ -D GPU_TARGETS=$GPU_TARGETS \ diff --git a/tile_engine/ops/gemm/CMakeLists.txt b/tile_engine/ops/gemm/CMakeLists.txt index d52351af2d..77165ae0fa 100644 --- a/tile_engine/ops/gemm/CMakeLists.txt +++ b/tile_engine/ops/gemm/CMakeLists.txt @@ -13,38 +13,38 @@ function(create_individual_gemm_target datatype layout trait tile_config config_ message(WARNING "Skipping individual GEMM target ${datatype}_${layout}_${trait}_${tile_config}: No supported GPU targets") return() endif() - + # Parse tile configuration: format is tile_mxtile_nxtile_k_warp_mxwarp_nxwarp_k_warp_tile_mxwarp_tile_nxwarp_tile_k # First split by underscore to get three groups string(REPLACE "_" ";" config_groups ${tile_config}) list(GET config_groups 0 tile_dims) # e.g., 256x256x32 list(GET config_groups 1 warp_dims) # e.g., 4x1x1 list(GET config_groups 2 warp_tile_dims) # e.g., 16x16x16 - + # Parse tile dimensions string(REPLACE "x" ";" tile_parts ${tile_dims}) list(GET tile_parts 0 tile_m) list(GET tile_parts 1 tile_n) list(GET tile_parts 2 tile_k) - + # Parse warp dimensions string(REPLACE "x" ";" warp_parts ${warp_dims}) list(GET warp_parts 0 warp_m) list(GET warp_parts 1 warp_n) list(GET warp_parts 2 warp_k) - + # Parse warp tile dimensions string(REPLACE "x" ";" warp_tile_parts ${warp_tile_dims}) list(GET warp_tile_parts 0 warp_tile_m) list(GET warp_tile_parts 1 warp_tile_n) list(GET warp_tile_parts 2 warp_tile_k) - + set(target_name "benchmark_gemm_${datatype}_${layout}_${trait}_${tile_config}") set(working_path "${CMAKE_CURRENT_BINARY_DIR}/${datatype}/${layout}") - + # Generate the single instance header for this kernel set(instance_header "${working_path}/gemm_single_${datatype}_${layout}_${trait}_${tile_config}.hpp") - + # Add custom command to generate the header file at build time add_custom_command( OUTPUT ${instance_header} @@ -60,27 +60,27 @@ function(create_individual_gemm_target datatype layout trait tile_config config_ DEPENDS ${GEMM_SOURCE_DIR}/gemm_instance_builder.py ${config_json} COMMENT "Generating ${instance_header}" ) - + # Create the executable - add_executable(${target_name} + add_executable(${target_name} ${GEMM_SOURCE_DIR}/benchmark_gemm_single.cpp ${instance_header} ) - + # Set GPU architectures set_property(TARGET ${target_name} PROPERTY HIP_ARCHITECTURES ${GEMM_GPU_TARGETS_INDIVIDUAL}) - + # Set compile definitions target_compile_definitions(${target_name} PRIVATE GEMM_SINGLE_INSTANCE_HPP="${instance_header}" ) - + # Include directories target_include_directories(${target_name} PRIVATE ${GEMM_SOURCE_DIR} ${working_path} ) - + # Compile options target_compile_options(${target_name} PRIVATE -Wno-undefined-func-template @@ -88,19 +88,19 @@ function(create_individual_gemm_target datatype layout trait tile_config config_ --offload-compress -include ${instance_header} ) - + # Add to collection targets add_dependencies(benchmark_gemm_all ${target_name}) add_dependencies(benchmark_gemm_${datatype} ${target_name}) add_dependencies(benchmark_gemm_${layout} ${target_name}) add_dependencies(benchmark_gemm_${datatype}_${layout} ${target_name}) - + # Add to trait-specific targets string(REPLACE "_" ";" trait_parts ${trait}) list(GET trait_parts 0 pipeline) list(GET trait_parts 1 epilogue) list(GET trait_parts 2 scheduler) - + add_dependencies(benchmark_gemm_${pipeline} ${target_name}) add_dependencies(benchmark_gemm_${epilogue} ${target_name}) add_dependencies(benchmark_gemm_${scheduler} ${target_name}) @@ -109,13 +109,13 @@ endfunction() # Function to build individual GEMM targets function(build_individual_gemm_targets datatype layout) set(working_path "${CMAKE_CURRENT_BINARY_DIR}/${datatype}/${layout}") - + # Choose config file # Priority order: # 1. Environment variable GEMM_CONFIG_FILE - # 2. CMake variable GEMM_CONFIG_FILE + # 2. CMake variable GEMM_CONFIG_FILE # 3. Default based on layout - + # Check environment variable first if(DEFINED ENV{GEMM_CONFIG_FILE} AND NOT "$ENV{GEMM_CONFIG_FILE}" STREQUAL "") set(config_filename "$ENV{GEMM_CONFIG_FILE}") @@ -130,12 +130,12 @@ function(build_individual_gemm_targets datatype layout) set(json_blob "${CMAKE_CURRENT_LIST_DIR}/configs/default_config.json") message(STATUS " Using default config for layout ${layout}") endif() - + # Check if config file exists if(NOT EXISTS ${json_blob}) message(FATAL_ERROR "Config file not found: ${json_blob}") endif() - + # Determine number of workers for parallel generation if(DEFINED ENV{CMAKE_BUILD_PARALLEL_LEVEL}) set(num_workers $ENV{CMAKE_BUILD_PARALLEL_LEVEL}) @@ -147,17 +147,24 @@ function(build_individual_gemm_targets datatype layout) set(num_workers 8) endif() endif() - + # Generate individual kernel files using parallel version message(STATUS "Generating individual kernels for ${datatype} ${layout} using ${num_workers} workers...") message(STATUS " Working path: ${working_path}") message(STATUS " Config file: ${json_blob}") message(STATUS " Python executable: ${Python3_EXECUTABLE}") message(STATUS " Script path: ${CMAKE_CURRENT_LIST_DIR}/gemm_instance_builder.py") - + # Create working directory first file(MAKE_DIRECTORY ${working_path}) - + + message(STATUS "COMMAND: ${Python3_EXECUTABLE} -u ${CMAKE_CURRENT_LIST_DIR}/gemm_instance_builder.py + --working_path ${working_path} + --datatype ${datatype} + --layout ${layout} + --config_json ${json_blob} + --list_kernels") + # First, just list the kernels (fast operation) message(STATUS " Listing kernel configurations...") execute_process( @@ -172,11 +179,11 @@ function(build_individual_gemm_targets datatype layout) OUTPUT_VARIABLE list_output ERROR_VARIABLE list_error ) - + if(NOT ret EQUAL 0) message(FATAL_ERROR "Failed to list kernels for ${datatype} ${layout}: ${list_error}") endif() - + # Read kernel count if(EXISTS ${working_path}/gemm_kernel_count.txt) file(READ ${working_path}/gemm_kernel_count.txt kernel_count) @@ -185,7 +192,7 @@ function(build_individual_gemm_targets datatype layout) else() message(FATAL_ERROR "Kernel count file not found") endif() - + # Read kernel list and create targets if(EXISTS ${working_path}/gemm_kernel_list.txt) file(STRINGS ${working_path}/gemm_kernel_list.txt kernel_lines) @@ -195,7 +202,7 @@ function(build_individual_gemm_targets datatype layout) list(GET parts 0 kernel_name) list(GET parts 1 tile_config) list(GET parts 2 trait_combo) - + # Create individual target create_individual_gemm_target("${datatype}" "${layout}" "${trait_combo}" "${tile_config}" "${json_blob}") endforeach() @@ -210,9 +217,9 @@ message(STATUS "GEMM_DATATYPE: ${GEMM_DATATYPE}") message(STATUS "GEMM_LAYOUT: ${GEMM_LAYOUT}") message(STATUS "SUPPORTED_GPU_TARGETS: ${SUPPORTED_GPU_TARGETS}") -# Filter GPU targets to only gfx90a, gfx942, and gfx950 +# Filter GPU targets to only gfx90a, gfx942, gfx950, gfx1201 set(GEMM_GPU_TARGETS_INDIVIDUAL "") -set(DESIRED_TARGETS "gfx90a;gfx942;gfx950") +set(DESIRED_TARGETS "gfx90a;gfx942;gfx950;gfx1201") foreach(target IN LISTS SUPPORTED_GPU_TARGETS) if(target IN_LIST DESIRED_TARGETS) @@ -223,13 +230,13 @@ endforeach() # Skip build if no matching targets found if(NOT GEMM_GPU_TARGETS_INDIVIDUAL) - message(WARNING "Skipping Tile Engine GEMM build: No supported GPU targets (gfx90a, gfx942, gfx950) found in SUPPORTED_GPU_TARGETS: ${SUPPORTED_GPU_TARGETS}") + message(WARNING "Skipping Tile Engine GEMM build: No supported GPU targets (gfx90a, gfx942, gfx950, gfx1201) found in SUPPORTED_GPU_TARGETS: ${SUPPORTED_GPU_TARGETS}") else() message(STATUS "Building individual GEMM targets for GPU targets: ${GEMM_GPU_TARGETS_INDIVIDUAL}") # Enable parallel compilation optimizations # Set up job pools for better parallel compilation control - set_property(GLOBAL PROPERTY JOB_POOLS + set_property(GLOBAL PROPERTY JOB_POOLS compile_heavy=4 # Limit heavy compilations to prevent OOM compile_normal=16 # Allow more parallel normal compilations ) diff --git a/tile_engine/ops/gemm/codegen_utils.py b/tile_engine/ops/gemm/codegen_utils.py index 6a87193043..98595933b8 100644 --- a/tile_engine/ops/gemm/codegen_utils.py +++ b/tile_engine/ops/gemm/codegen_utils.py @@ -179,6 +179,11 @@ warp_tile_supported_combinations = { [32, 32, 64], ], }, + "gfx1201": { + "fp16_fp16_fp16": [ + [16, 16, 16], + ], + }, } # To Do: remove some unsupported combinations diff --git a/tile_engine/ops/gemm/configs/gfx120x_config.json b/tile_engine/ops/gemm/configs/gfx120x_config.json new file mode 100644 index 0000000000..6c4a5d0ec0 --- /dev/null +++ b/tile_engine/ops/gemm/configs/gfx120x_config.json @@ -0,0 +1,102 @@ +{ + "problem": { + }, + "tile_config": { + "tile_m": { + "values": [ + 256, + 128, + 64 + ] + }, + "tile_n": { + "values": [ + 256, + 128, + 64 + ] + }, + "tile_k": { + "values": [ + 256, + 128, + 64 + ] + }, + "warp_m": { + "values": [ + 4, + 2, + 1 + ] + }, + "warp_n": { + "values": [ + 4, + 2, + 1 + ] + }, + "warp_k": { + "values": [ + 1 + ] + }, + "warp_tile_m": { + "values": [ + 16 + ] + }, + "warp_tile_n": { + "values": [ + 16 + ] + }, + "warp_tile_k": { + "values": [ + 16 + ] + } + }, + "trait_config": { + "pipeline": { + "values": [ + "compv3", + "mem" + ] + }, + "scheduler": { + "values": [ + "intrawave", + "interwave" + ] + }, + "epilogue": { + "values": [ + "cshuffle", + "default" + ] + }, + "pad_m": { + "values": [ + false + ] + }, + "pad_n": { + "values": [ + false + ] + }, + "pad_k": { + "values": [ + false + ] + }, + "persistent": { + "values": [ + false, + true + ] + } + } +} diff --git a/tile_engine/ops/gemm/validation_utils.py b/tile_engine/ops/gemm/validation_utils.py index 7367f2446d..c0e109bf11 100644 --- a/tile_engine/ops/gemm/validation_utils.py +++ b/tile_engine/ops/gemm/validation_utils.py @@ -103,6 +103,36 @@ WARP_TILE_SUPPORTED_COMBINATIONS = { [32, 32, 64], ], }, + "gfx1201": { + "fp16_fp16_fp16": [ + [16, 16, 16], + ], + }, +} + +# Supported warp tile combinations for different GPU architectures and data types +WARP_SUPPORTED_COMBINATIONS = { + "gfx90a": [ + [1, 4, 1], + [2, 2, 1], + [4, 1, 1], + ], + "gfx942": [ + [1, 4, 1], + [2, 2, 1], + [4, 1, 1], + ], + "gfx950": [ + [1, 4, 1], + [2, 2, 1], + [4, 1, 1], + ], + "gfx1201": [ + [2, 4, 1], + [1, 8, 1], + [8, 1, 1], + [4, 2, 1], + ], } # Unsupported trait combinations @@ -155,9 +185,32 @@ def is_trait_combination_valid(pipeline: str, epilogue: str, scheduler: str) -> return (pipeline, epilogue, scheduler) not in TRAIT_UNSUPPORTED_COMBINATIONS -def validate_warp_configuration(warp_m: int, warp_n: int, warp_k: int) -> bool: +def validate_warp_configuration( + warp_m: int, + warp_n: int, + warp_k: int, + gpu_name: str = None, +) -> bool: """Validate warp configuration.""" - return (warp_m, warp_n, warp_k) in [(1, 4, 1), (2, 2, 1), (4, 1, 1)] + if gpu_name is None: + gpu_name = get_gpu_name_by_id(0) + + current_combination = [warp_m, warp_n, warp_k] + + allowed_combinations = WARP_SUPPORTED_COMBINATIONS.get(gpu_name, {}) + if not allowed_combinations: + # If GPU not recognized, try to be permissive but log warning + logging.warning(f"No warp_[m/n/k] combinations found for GPU: {gpu_name}") + return True + + # Check if current combination is in the allowed list + if current_combination not in allowed_combinations: + error_msg = ( + f"Invalid warp tile combination: {current_combination} not in allowed list. " + ) + return False + + return True def validate_dimension_alignment(