[rocm-libraries] ROCm/rocm-libraries#7466 (commit cc2861f)

[CK Tile] Enable hardware OOB buffer load offset trick by default (#7466)

## Summary

Enables `CK_TILE_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK`
inside `config.hpp`.

  ### Background

When loading from global memory with an out-of-bounds (OOB) check, CK Tile
must suppress invalid lanes. The previous default used a software
branch:

  ```cpp
  // Old path (oob_conditional_check, no trick)
  if(!src_thread_element_valid) { return zeros; }
  return amd_buffer_load_impl(...);
  ```

  This generates divergent control flow: the compiler emits exec-mask save/restore and per-lane comparison SALU instructions, one set per buffer load that touches a padded dimension.

  ### Change

  With the trick enabled, invalid lanes are suppressed entirely in hardware:
  ```cpp
  // New path (trick enabled)
  uint32_t shift = src_thread_element_valid ? 0 : 0x80000000;
  return amd_buffer_load_impl(resource, shift + offset, 0);
  ```
  The `0x80000000` offset overflows the buffer descriptor's declared size, causing the hardware to silently return zero for that lane — no branch, no exec-mask manipulation. This matches the behavior of old CK XDL kernels, which use an unconditional load followed by a `v_cndmask` select.

  ### Expected impact

  Eliminates the ALU overhead of OOB validity branches, which reduces kernel execution time, especially in memory-bound cases.

---------

Co-authored-by: Bartłomiej Kocot <barkocot@amd.com>
This commit is contained in:
jakpiase
2026-05-21 12:06:40 +02:00
committed by GitHub
parent 9cf49cd322
commit 309d823056

View File

@@ -25,17 +25,17 @@ if(GPU_TARGETS MATCHES "gfx9")
# Declare the device_grouped_conv_fwd_tile_instances library from the source
# list in GROUPED_CONV_FWD_TILE (that variable is populated outside this
# excerpt). add_instance_library is a project helper defined elsewhere;
# presumably it wraps add_library -- TODO confirm.
add_instance_library(device_grouped_conv_fwd_tile_instances ${GROUPED_CONV_FWD_TILE})
# Add the builder test-utility headers to this target's own include path only
# (PRIVATE: not propagated to consumers).
target_include_directories(device_grouped_conv_fwd_tile_instances PRIVATE
"${PROJECT_SOURCE_DIR}/experimental/builder/test/utils")
# NOTE(review): the next two calls both pass
# -DCK_TILE_FLOAT_TO_BFLOAT16_DEFAULT=0; they look like the before/after pair
# of a diff hunk rather than two intended calls -- confirm which one should
# remain.
target_compile_options(device_grouped_conv_fwd_tile_instances PRIVATE -DCK_TILE_FLOAT_TO_BFLOAT16_DEFAULT=0)
# Same define as the previous call, plus
# -DCK_TILE_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK=1 for this
# target only.
target_compile_options(device_grouped_conv_fwd_tile_instances PRIVATE -DCK_TILE_FLOAT_TO_BFLOAT16_DEFAULT=0 -DCK_TILE_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK=1)
# Recursively collect every .cpp under <current binary dir>/backward_weight
# into GROUPED_CONV_BWD_WEIGHT_TILE. The glob is evaluated at configure time,
# and no CONFIGURE_DEPENDS is given, so files added later are only picked up
# on the next CMake run. Sources living in the binary dir are presumably
# generated by an earlier step -- TODO confirm.
file(GLOB_RECURSE GROUPED_CONV_BWD_WEIGHT_TILE "${CMAKE_CURRENT_BINARY_DIR}/backward_weight/*.cpp")
# Declare the library from the collected sources. add_instance_library is a
# project helper defined elsewhere; presumably it wraps add_library -- TODO
# confirm.
add_instance_library(device_grouped_conv_bwd_weight_tile_instances ${GROUPED_CONV_BWD_WEIGHT_TILE})
# Add the builder test-utility headers to this target's own include path only
# (PRIVATE: not propagated to consumers).
target_include_directories(device_grouped_conv_bwd_weight_tile_instances PRIVATE
"${PROJECT_SOURCE_DIR}/experimental/builder/test/utils")
# NOTE(review): the next two calls both pass
# -DCK_TILE_FLOAT_TO_BFLOAT16_DEFAULT=0; they look like the before/after pair
# of a diff hunk rather than two intended calls -- confirm which one should
# remain.
target_compile_options(device_grouped_conv_bwd_weight_tile_instances PRIVATE -DCK_TILE_FLOAT_TO_BFLOAT16_DEFAULT=0)
# Same define as the previous call, plus
# -DCK_TILE_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK=1 for this
# target only.
target_compile_options(device_grouped_conv_bwd_weight_tile_instances PRIVATE -DCK_TILE_FLOAT_TO_BFLOAT16_DEFAULT=0 -DCK_TILE_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK=1)
# Recursively collect every .cpp under <current binary dir>/backward_data
# into GROUPED_CONV_BWD_DATA_TILE. The glob is evaluated at configure time,
# and no CONFIGURE_DEPENDS is given, so files added later are only picked up
# on the next CMake run. Sources living in the binary dir are presumably
# generated by an earlier step -- TODO confirm.
file(GLOB_RECURSE GROUPED_CONV_BWD_DATA_TILE "${CMAKE_CURRENT_BINARY_DIR}/backward_data/*.cpp")
# Declare the library from the collected sources. add_instance_library is a
# project helper defined elsewhere; presumably it wraps add_library -- TODO
# confirm.
add_instance_library(device_grouped_conv_bwd_data_tile_instances ${GROUPED_CONV_BWD_DATA_TILE})
# Add the builder test-utility headers to this target's own include path only
# (PRIVATE: not propagated to consumers).
target_include_directories(device_grouped_conv_bwd_data_tile_instances PRIVATE
"${PROJECT_SOURCE_DIR}/experimental/builder/test/utils")
# NOTE(review): the next two calls both pass
# -DCK_TILE_FLOAT_TO_BFLOAT16_DEFAULT=0; they look like the before/after pair
# of a diff hunk rather than two intended calls -- confirm which one should
# remain.
target_compile_options(device_grouped_conv_bwd_data_tile_instances PRIVATE -DCK_TILE_FLOAT_TO_BFLOAT16_DEFAULT=0)
# Same define as the previous call, plus
# -DCK_TILE_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK=1 for this
# target only.
target_compile_options(device_grouped_conv_bwd_data_tile_instances PRIVATE -DCK_TILE_FLOAT_TO_BFLOAT16_DEFAULT=0 -DCK_TILE_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK=1)
endif()