From 309d823056ad997fd0f7cace2ec707244709ebb7 Mon Sep 17 00:00:00 2001 From: jakpiase Date: Thu, 21 May 2026 12:06:40 +0200 Subject: [PATCH] [rocm-libraries] ROCm/rocm-libraries#7466 (commit cc2861f) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [CK Tile] Enable hardware OOB buffer load offset trick by default (#7466) ## Summary Enables `CK_TILE_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK` inside `config.hpp`. ### Background When loading from global memory with an out-of-bounds (OOB) check, CK Tile must suppress invalid lanes. The previous default used a software branch: ```cpp // Old path (oob_conditional_check, no trick) if(!src_thread_element_valid) { return zeros; } return amd_buffer_load_impl(...); ``` This generates divergent control flow: the compiler emits exec-mask save/restore and per-lane comparison SALU instructions, one set per buffer load that touches a padded dimension. ### Change With the trick enabled, invalid lanes are suppressed entirely in hardware: ```cpp // New path (trick enabled) uint32_t shift = src_thread_element_valid ? 0 : 0x80000000; return amd_buffer_load_impl(resource, shift + offset, 0); ``` The `0x80000000` offset overflows the buffer descriptor's declared size, causing the hardware to silently return zero for that lane - no branch, no exec mask manipulation. This matches the behavior of old CK XDL kernels, which use an unconditional load followed by a `v_cndmask` select. ### Expected impact Eliminates ALU overhead from OOB validity branches, which reduces kernel execution time, especially for memory-bound cases. 
--------- Co-authored-by: Bartłomiej Kocot --- .../grouped_convolution_tile_instances/CMakeLists.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/experimental/grouped_convolution_tile_instances/CMakeLists.txt b/experimental/grouped_convolution_tile_instances/CMakeLists.txt index 94639c65ca..db42525a93 100644 --- a/experimental/grouped_convolution_tile_instances/CMakeLists.txt +++ b/experimental/grouped_convolution_tile_instances/CMakeLists.txt @@ -25,17 +25,17 @@ if(GPU_TARGETS MATCHES "gfx9") add_instance_library(device_grouped_conv_fwd_tile_instances ${GROUPED_CONV_FWD_TILE}) target_include_directories(device_grouped_conv_fwd_tile_instances PRIVATE "${PROJECT_SOURCE_DIR}/experimental/builder/test/utils") - target_compile_options(device_grouped_conv_fwd_tile_instances PRIVATE -DCK_TILE_FLOAT_TO_BFLOAT16_DEFAULT=0) + target_compile_options(device_grouped_conv_fwd_tile_instances PRIVATE -DCK_TILE_FLOAT_TO_BFLOAT16_DEFAULT=0 -DCK_TILE_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK=1) file(GLOB_RECURSE GROUPED_CONV_BWD_WEIGHT_TILE "${CMAKE_CURRENT_BINARY_DIR}/backward_weight/*.cpp") add_instance_library(device_grouped_conv_bwd_weight_tile_instances ${GROUPED_CONV_BWD_WEIGHT_TILE}) target_include_directories(device_grouped_conv_bwd_weight_tile_instances PRIVATE "${PROJECT_SOURCE_DIR}/experimental/builder/test/utils") - target_compile_options(device_grouped_conv_bwd_weight_tile_instances PRIVATE -DCK_TILE_FLOAT_TO_BFLOAT16_DEFAULT=0) + target_compile_options(device_grouped_conv_bwd_weight_tile_instances PRIVATE -DCK_TILE_FLOAT_TO_BFLOAT16_DEFAULT=0 -DCK_TILE_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK=1) file(GLOB_RECURSE GROUPED_CONV_BWD_DATA_TILE "${CMAKE_CURRENT_BINARY_DIR}/backward_data/*.cpp") add_instance_library(device_grouped_conv_bwd_data_tile_instances ${GROUPED_CONV_BWD_DATA_TILE}) target_include_directories(device_grouped_conv_bwd_data_tile_instances PRIVATE "${PROJECT_SOURCE_DIR}/experimental/builder/test/utils") - 
target_compile_options(device_grouped_conv_bwd_data_tile_instances PRIVATE -DCK_TILE_FLOAT_TO_BFLOAT16_DEFAULT=0) + target_compile_options(device_grouped_conv_bwd_data_tile_instances PRIVATE -DCK_TILE_FLOAT_TO_BFLOAT16_DEFAULT=0 -DCK_TILE_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK=1) endif()