diff --git a/include/ck/utility/amd_buffer_addressing_builtins.hpp b/include/ck/utility/amd_buffer_addressing_builtins.hpp
index 1836e9461d..f642e06050 100644
--- a/include/ck/utility/amd_buffer_addressing_builtins.hpp
+++ b/include/ck/utility/amd_buffer_addressing_builtins.hpp
@@ -402,7 +402,7 @@ __device__ void amd_global_atomic_add_impl(const typename vector_type<T, N>::typ
                                                       tmp.template AsType<half2_t>()[i]);
         });
     }
-#if defined(__gfx942__) || defined(__gfx950__)
+#if defined(__gfx942__) || defined(__gfx950__) || defined(__gfx12__)
     else if constexpr(is_same<T, bhalf_t>::value)
     {
         vector_type<bhalf_t, N> tmp{src_thread_data};
@@ -838,10 +838,18 @@ __device__ void amd_direct_load_global_to_lds(const T* global_base_ptr,
                                               const bool is_valid,
                                               const index_t src_element_space_size)
 {
-    // Direct loads require that each thread reads and writes exactly a single DWORD.
-    constexpr auto dword_bytes      = 4;
+    // Direct loads require that each thread reads and writes a multiple of DWORDs (4 bytes).
+    // For gfx950: supports 1, 3, or 4 DWORDs per thread
+    // For gfx942: supports exactly 1 DWORD per thread
     constexpr auto bytes_per_thread = sizeof(T) * NumElemsPerThread;
+#if defined(__gfx950__)
+    constexpr auto dword_bytes = 4;
+    static_assert(bytes_per_thread == dword_bytes || bytes_per_thread == dword_bytes * 3 ||
+                  bytes_per_thread == dword_bytes * 4);
+#elif defined(__gfx942__)
+    constexpr auto dword_bytes = 4;
     static_assert(bytes_per_thread == dword_bytes);
+#endif
 
     const int32x4_t src_resource =
         make_wave_buffer_resource(global_base_ptr, src_element_space_size);
@@ -872,7 +880,7 @@ __device__ void amd_direct_load_global_to_lds(const T* global_base_ptr,
 #endif
 
     llvm_amdgcn_raw_buffer_load_lds(
-        src_resource, lds_ptr, sizeof(uint32_t), global_offset_bytes, 0, 0, 0);
+        src_resource, lds_ptr, bytes_per_thread, global_offset_bytes, 0, 0, 0);
 #endif
 }
 #endif