mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-06-12 01:08:27 +00:00
[rocm-libraries] ROCm/rocm-libraries#7466 (commit cc2861f)
[CK Tile] Enable hardware OOB buffer load offset trick by default (#7466) ## Summary Enables `CK_TILE_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK` inside `config.hpp`. ### Background When loading from global memory with out-of-bound (OOB) check, CK Tile must suppress invalid lanes. The previous default used a software branch: ```cpp // Old path (oob_conditional_check, no trick) if(!src_thread_element_valid) { return zeros; } return amd_buffer_load_impl(...); ``` This generates divergent control flow, the compiler emits exec-mask save/restore and per-lane comparison SALU instructions one set per buffer load that touches a padded dimension. ### Change With the trick enabled, invalid lanes are suppressed entirely in hardware: ```cpp // New path (trick enabled) uint32_t shift = src_thread_element_valid ? 0 : 0x80000000; return amd_buffer_load_impl(resource, shift + offset, 0); ``` The `0x80000000` offset overflows the buffer descriptor's declared size, causing the hardware to silently return zero for that lane - no branch, no exec mask manipulation. This matches the behavior of old CK XDL kernels, which use an unconditional load followed by a `v_cndmask` select. ### Expected impact Eliminates ALU overhead from OOB validity branches which reduces the kernel execution time, especially for memory-bound cases. --------- Co-authored-by: Bartłomiej Kocot <barkocot@amd.com>
This commit is contained in:
@@ -25,17 +25,17 @@ if(GPU_TARGETS MATCHES "gfx9")
|
||||
add_instance_library(device_grouped_conv_fwd_tile_instances ${GROUPED_CONV_FWD_TILE})
|
||||
target_include_directories(device_grouped_conv_fwd_tile_instances PRIVATE
|
||||
"${PROJECT_SOURCE_DIR}/experimental/builder/test/utils")
|
||||
target_compile_options(device_grouped_conv_fwd_tile_instances PRIVATE -DCK_TILE_FLOAT_TO_BFLOAT16_DEFAULT=0)
|
||||
target_compile_options(device_grouped_conv_fwd_tile_instances PRIVATE -DCK_TILE_FLOAT_TO_BFLOAT16_DEFAULT=0 -DCK_TILE_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK=1)
|
||||
|
||||
file(GLOB_RECURSE GROUPED_CONV_BWD_WEIGHT_TILE "${CMAKE_CURRENT_BINARY_DIR}/backward_weight/*.cpp")
|
||||
add_instance_library(device_grouped_conv_bwd_weight_tile_instances ${GROUPED_CONV_BWD_WEIGHT_TILE})
|
||||
target_include_directories(device_grouped_conv_bwd_weight_tile_instances PRIVATE
|
||||
"${PROJECT_SOURCE_DIR}/experimental/builder/test/utils")
|
||||
target_compile_options(device_grouped_conv_bwd_weight_tile_instances PRIVATE -DCK_TILE_FLOAT_TO_BFLOAT16_DEFAULT=0)
|
||||
target_compile_options(device_grouped_conv_bwd_weight_tile_instances PRIVATE -DCK_TILE_FLOAT_TO_BFLOAT16_DEFAULT=0 -DCK_TILE_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK=1)
|
||||
|
||||
file(GLOB_RECURSE GROUPED_CONV_BWD_DATA_TILE "${CMAKE_CURRENT_BINARY_DIR}/backward_data/*.cpp")
|
||||
add_instance_library(device_grouped_conv_bwd_data_tile_instances ${GROUPED_CONV_BWD_DATA_TILE})
|
||||
target_include_directories(device_grouped_conv_bwd_data_tile_instances PRIVATE
|
||||
"${PROJECT_SOURCE_DIR}/experimental/builder/test/utils")
|
||||
target_compile_options(device_grouped_conv_bwd_data_tile_instances PRIVATE -DCK_TILE_FLOAT_TO_BFLOAT16_DEFAULT=0)
|
||||
target_compile_options(device_grouped_conv_bwd_data_tile_instances PRIVATE -DCK_TILE_FLOAT_TO_BFLOAT16_DEFAULT=0 -DCK_TILE_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK=1)
|
||||
endif()
|
||||
|
||||
Reference in New Issue
Block a user