[rocm-libraries] ROCm/rocm-libraries#5842 (commit 04c5690)

[CK][CK Tile] Force padding for atomic_add bf16 C tensor
 (#5842)

## Motivation

Force padding for atomic_add bf16 C tensor to avoid memfaults.

## Technical Details

- add global atomic_add support for bf16 and enable it
- add padding for atomic_add bf16 due to the lack of an out-of-bounds (OOB) check
- remove padding for non-contiguous dims in conv for other cases
- minor bwd data conv fixes

## Test Plan

test_grouped_conv_*_tile

## Test Result

pending

## Submission Checklist

- [x] Look over the contributing guidelines at
https://github.com/ROCm/ROCm/blob/develop/CONTRIBUTING.md#pull-requests.
This commit is contained in:
Bartłomiej Kocot
2026-03-31 08:03:41 +00:00
committed by assistant-librarian[bot]
parent 66dc81d530
commit ef4ff4667d
7 changed files with 174 additions and 171 deletions

View File

@@ -1021,6 +1021,11 @@ struct UniversalGemmKernel
const auto& e_tensor_view =
make_tensor_view<address_space_enum::global, DstInMemOp>(e_ptr, e_desc);
// For bf16_t and atomic_add global_atomic_add is used instead of buffer_atomic_add
// Add padding for not contiguous dim due to the lack of OOB check
constexpr bool pad_not_contiguous_dim =
std::is_same_v<EDataType, bf16_t> && DstInMemOp == memory_operation_enum::atomic_add;
// Step 2: Create padded view
const auto& e_pad_view = [&]() {
if constexpr(std::is_same_v<CLayout, tensor_layout::gemm::RowMajor>)
@@ -1028,14 +1033,14 @@ struct UniversalGemmKernel
return pad_tensor_view(e_tensor_view,
make_tuple(number<TilePartitioner::MPerBlock>{},
number<TilePartitioner::NPerBlock>{}),
sequence<false, GemmPipeline::kPadN>{});
sequence<pad_not_contiguous_dim, GemmPipeline::kPadN>{});
}
else
{
return pad_tensor_view(e_tensor_view,
make_tuple(number<TilePartitioner::MPerBlock>{},
number<TilePartitioner::NPerBlock>{}),
sequence<GemmPipeline::kPadM, false>{});
sequence<GemmPipeline::kPadM, pad_not_contiguous_dim>{});
}
}();