From ae6bccf341fb4410241f696ba06873023d5ce4ed Mon Sep 17 00:00:00 2001 From: questa-quan-wang Date: Thu, 7 May 2026 15:03:37 +0800 Subject: [PATCH] [CuTeDSL] Update atomic_max_float32 to atomic_fmax in blockscaled GEMM example (#3206) The internal DSL package refactored atomic_max_float32 to atomic_fmax, which properly handles negative floats via sign-bit-aware integer atomics. Update the example to use the new API so it works with current DSL wheels. Co-authored-by: Questa Wang --- .../dense_blockscaled_gemm_persistent_amax.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/examples/python/CuTeDSL/cute/blackwell/kernel/blockscaled_gemm/dense_blockscaled_gemm_persistent_amax.py b/examples/python/CuTeDSL/cute/blackwell/kernel/blockscaled_gemm/dense_blockscaled_gemm_persistent_amax.py index c4c35f44a..d076903ce 100644 --- a/examples/python/CuTeDSL/cute/blackwell/kernel/blockscaled_gemm/dense_blockscaled_gemm_persistent_amax.py +++ b/examples/python/CuTeDSL/cute/blackwell/kernel/blockscaled_gemm/dense_blockscaled_gemm_persistent_amax.py @@ -1423,9 +1423,8 @@ class Sm100BlockScaledPersistentDenseGemmKernel: # Global atomic max (accumulates across all tiles for final tensor amax) # Since we compute absolute values, all values are non-negative - # Use wrapper function for atomic max operation - _ = cute.arch.atomic_max_float32( - ptr=mAmax.iterator.llvm_ptr, value=block_amax + _ = cute.arch.atomic_fmax( + mAmax.iterator, block_amax, sign_bit=False ) # # Async arrive accumulator buffer empty