diff --git a/include/ck_tile/core/arch/amd_buffer_addressing.hpp b/include/ck_tile/core/arch/amd_buffer_addressing.hpp index 0ec1a95511..12f49aa4e3 100644 --- a/include/ck_tile/core/arch/amd_buffer_addressing.hpp +++ b/include/ck_tile/core/arch/amd_buffer_addressing.hpp @@ -2784,10 +2784,13 @@ CK_TILE_DEVICE void amd_direct_load_global_to_lds(const T* global_base_ptr, #endif } +#if defined(__gfx950__) template __device__ auto amd_transpose_load_to_vgpr(const T* in_ptr) { + static_assert(__has_builtin(__builtin_amdgcn_raw_buffer_load_b32), + "We need to have the compatible compiler version to build this instruction"); if constexpr(std::is_same_v, ck_tile::half_t>) { typedef __attribute__((__vector_size__(4 * sizeof(__fp16)))) __fp16 llvm_fp16x4_t; @@ -2817,6 +2820,7 @@ __device__ auto amd_transpose_load_to_vgpr(const T* in_ptr) static_assert(false, "not implemented"); } } +#endif } // namespace ck_tile diff --git a/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp b/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp index 53a344c7b0..306d2cdac3 100644 --- a/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp +++ b/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp @@ -2554,6 +2554,44 @@ CK_TILE_DEVICE void amd_direct_load_global_to_lds(const T* global_base_ptr, #endif } +#if defined(__gfx950__) +template +__device__ auto amd_transpose_load_to_vgpr(const T* in_ptr) +{ + + static_assert(__has_builtin(__builtin_amdgcn_raw_buffer_load_b32), + "We need to have the compatible compiler version to build this instruction"); + if constexpr(std::is_same_v, ck_tile::half_t>) + { + typedef __attribute__((__vector_size__(4 * sizeof(__fp16)))) __fp16 llvm_fp16x4_t; + __attribute__((address_space(3))) llvm_fp16x4_t* lds_ptr = + reinterpret_cast<__attribute__((address_space(3))) llvm_fp16x4_t*>( + reinterpret_cast(in_ptr)); + return bit_cast>(__builtin_amdgcn_ds_read_tr16_b64_v4f16(lds_ptr)); + } + else if constexpr(std::is_same_v, ck_tile::bf16_t>) + { + typedef __attribute__((__vector_size__(4 * sizeof(__bf16)))) __bf16 llvm_bf16x4_t; + __attribute__((address_space(3))) llvm_bf16x4_t* lds_ptr = + reinterpret_cast<__attribute__((address_space(3))) llvm_bf16x4_t*>( + reinterpret_cast(in_ptr)); + return bit_cast>(__builtin_amdgcn_ds_read_tr16_b64_v4bf16(lds_ptr)); + } + else if constexpr(std::is_same_v, ck_tile::fp8_t>) + { + typedef __attribute__((__vector_size__(2 * sizeof(index_t)))) index_t llvm_fp8x8_t; + __attribute__((address_space(3))) llvm_fp8x8_t* lds_ptr = + reinterpret_cast<__attribute__((address_space(3))) llvm_fp8x8_t*>( + reinterpret_cast(in_ptr)); + return bit_cast>(__builtin_amdgcn_ds_read_tr8_b64_v2i32(lds_ptr)); + } + else + { + static_assert(false, "not implemented"); + } +} +#endif + } // namespace ck_tile #endif // CK_TILE_USE_BUFFER_ADDRESSING_BUILTIN diff --git a/include/ck_tile/core/tensor/buffer_view.hpp b/include/ck_tile/core/tensor/buffer_view.hpp index cd7b7d0a1f..8d19337b86 100644 --- a/include/ck_tile/core/tensor/buffer_view.hpp +++ b/include/ck_tile/core/tensor/buffer_view.hpp @@ -902,8 +902,9 @@ struct buffer_view>::scalar_type, typename vector_traits>::scalar_type>::value, bool>::type = false> - CK_TILE_DEVICE constexpr auto - transpose_get(index_t i, index_t linear_offset, bool is_valid_element) const + CK_TILE_DEVICE constexpr auto transpose_get([[maybe_unused]] index_t i, + [[maybe_unused]] index_t linear_offset, + bool is_valid_element) const { // X contains multiple T constexpr index_t scalar_per_t_vector = vector_traits>::vector_size; @@ -913,13 +914,16 @@ struct buffer_view, t_per_x, addr_space>( p_data_ + i + linear_offset); +#else + return X{numeric>::zero()}; +#endif } else {