From 7a9fe055cb69ab2de605a0cf7dbb33f27833f7f3 Mon Sep 17 00:00:00 2001
From: Blake Ledden <47259830+bledden@users.noreply.github.com>
Date: Fri, 24 Apr 2026 20:27:40 -0700
Subject: [PATCH] fix: Add missing kElementsPerAccess division in RegularTileIterator store (#3049)

The store(frag, tile_offset) method was computing the pointer offset
without dividing by kElementsPerAccess, while the matching
load(frag, tile_offset) method does include this division.

Both load_with_pointer_offset and store_with_pointer_offset apply the
same byte conversion, so the tile_offset -> pointer_offset calculation
must also match. When kElementsPerAccess > 1, this caused load and
store to reference different memory locations for the same logical
tile offset.

Fixes #3017

Signed-off-by: Blake Ledden
---
 .../transform/threadblock/regular_tile_iterator_pitch_linear.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h b/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h
index 1485b1533..4a08ba9e3 100644
--- a/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h
+++ b/include/cutlass/transform/threadblock/regular_tile_iterator_pitch_linear.h
@@ -204,7 +204,8 @@ public:
   void store(Fragment const &frag, TensorCoord const & tile_offset) {
     store_with_pointer_offset(
       frag,
-      tile_offset.contiguous() * Shape::kContiguous + tile_offset.strided() * Shape::kStrided * stride_
+      tile_offset.contiguous() * Shape::kContiguous / ThreadMap::kElementsPerAccess +
+        tile_offset.strided() * Shape::kStrided * stride_
     );
   }
 