From ac62d13ecdf1d959e62582a6bce017d6e4f90692 Mon Sep 17 00:00:00 2001 From: Chao Liu Date: Wed, 29 Jul 2020 18:04:09 -0500 Subject: [PATCH] Improve buffer addressing for out-of-bounds check (#21) * Use the buffer load built-in OOB check; buffer size is limited to 2 GB. * Buffer APIs use a combined wave and thread offset. * Use uint32_t for the address shift in buffer addressing. --- ...tion_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp | 8 +- .../blockwise_generic_tensor_slice_copy.hpp | 27 +- .../threadwise_generic_tensor_slice_copy.hpp | 367 +----- .../include/utility/amd_buffer_addressing.hpp | 1123 +++++++++++------ .../include/utility/config.amd.hpp.in | 11 +- .../utility/in_memory_operation.amd.hpp.in | 125 +- 6 files changed, 872 insertions(+), 789 deletions(-) diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp index ae68f4486e..5e4f621807 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp @@ -110,14 +110,14 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2, 3>{}, Sequence<4, 5>{})); - constexpr auto in_gemmm_gemmn_global_desc = transform_tensor_descriptor( + constexpr auto in_gemmk_gemmn_global_desc = transform_tensor_descriptor( in_n_c_y_ho_x_wo_global_desc, make_tuple(Merge>{}, Merge>{}), make_tuple(Sequence<1, 2, 4>{}, Sequence<0, 3, 5>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); // output tensor - constexpr auto out_gemmk_gemmn_global_desc = + constexpr auto out_gemmm_gemmn_global_desc = transform_tensor_descriptor(unfold_tensor_descriptor(out_n_k_ho_wo_global_desc, I2, I3), make_tuple(PassThrough{}, Merge>{}), make_tuple(Sequence<1>{}, Sequence<0, 2>{}), @@ -130,8 +130,8 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw Float, AccFloat, decltype(wei_gemmk_gemmm_global_desc), - decltype(in_gemmm_gemmn_global_desc), - decltype(out_gemmk_gemmn_global_desc), + decltype(in_gemmk_gemmn_global_desc), + decltype(out_gemmm_gemmn_global_desc), InMemoryDataOperation::Set, GemmMPerBlock, GemmNPerBlock, diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp index 39c1fb86fa..a63ebd27bc 100644 --- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp @@ -84,21 +84,10 @@ struct BlockwiseGenericTensorSliceCopy_v4 __device__ void RunLoadThreadBuffer(const BlockSrcData* p_block_src, ThreadBufferData* p_thread_buffer) const { - constexpr bool has_optimized_address_calculation = - decltype(mThreadwiseStore)::HasWorkingOptimizedAddressCalculation(); - if(BlockSize == mThreadClusterDesc.GetElementSize() or get_thread_local_1d_id() < mThreadClusterDesc.GetElementSize()) { - // TODO: threadwise copy is still being tweaked - if(has_optimized_address_calculation) - { - mThreadwiseLoad.Run_optimized_src_address_calculation(p_block_src, p_thread_buffer); - } - else - { - mThreadwiseLoad.Run(p_block_src, p_thread_buffer); - } + mThreadwiseLoad.Run(p_block_src, p_thread_buffer); } } @@ -106,22 +95,10 @@ struct
BlockwiseGenericTensorSliceCopy_v4 __device__ void RunStoreThreadBuffer(const ThreadBufferData* p_thread_buffer, BlockDstData* p_block_dst) const { - constexpr bool has_optimized_address_calculation = - decltype(mThreadwiseStore)::HasWorkingOptimizedAddressCalculation(); - if(BlockSize == mThreadClusterDesc.GetElementSize() or get_thread_local_1d_id() < mThreadClusterDesc.GetElementSize()) { - // TODO: threadwise copy is still being tweaked - if(has_optimized_address_calculation) - { - mThreadwiseStore.Run_optimized_dst_address_calculation(p_thread_buffer, - p_block_dst); - } - else - { - mThreadwiseStore.Run(p_thread_buffer, p_block_dst); - } + mThreadwiseStore.Run(p_thread_buffer, p_block_dst); } } diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp index 2dd4a79912..db6660a3cb 100644 --- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp @@ -93,11 +93,13 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // buffer to hold a src long-vector SrcData p_src_long_vector[long_vector_size]; +#if 1 // zero out buffer for(index_t i = 0; i < long_vector_size; ++i) { p_src_long_vector[i] = 0; } +#endif // load data from src to the long-vector buffer for(index_t i = 0; i < long_vector_size / src_data_per_access; ++i) @@ -112,17 +114,20 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // Check src data's valid mapping situation, only check the first data in this src // vector. It's user's responsiblity to make sure all data in the src vector // has the valid/invalid mapping situation - if(src_coord.IsOffsetValidAssumingUpperIndexIsValid()) - { - transfer_data( - p_src, src_coord.GetOffset(), p_src_long_vector, buffer_offset); - } + transfer_data(p_src, + src_coord.GetOffset(), + src_coord.IsOffsetValidAssumingUpperIndexIsValid(), + SrcDesc::GetElementSpace(), + p_src_long_vector, + buffer_offset, + true, + long_vector_size); } // SrcData to DstData conversion @@ -146,336 +151,24 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // Check dst data's valid mapping situation, only check the first data in this dst // vector. It's user's responsiblity to make sure all data in the dst vector // has the valid/invalid mapping situation - if(dst_coord.IsOffsetValidAssumingUpperIndexIsValid()) - { - transfer_data( - p_dst_long_vector, buffer_offset, p_dst, dst_coord.GetOffset()); - } + transfer_data(p_dst_long_vector, + buffer_offset, + true, + long_vector_size, + p_dst, + dst_coord.GetOffset(), + dst_coord.IsOffsetValidAssumingUpperIndexIsValid(), + DstDesc::GetElementSpace()); } }); } - // Modify Length to 1, if Mask is set to false - // Used for isolating linear dimension from non-linear dimensions - template - __device__ static constexpr auto mask_lengths(Sequence, Sequence) - { - return Sequence<(Mask ? 
Lengths : 1)...>{}; - } - - // Will do valid mapping check on src data: Read 0 if src data has a invalid mapping - // Will do valid mapping check on dst data: No write if dst data has a invalid mapping - // This version is optimized for address calculation of src tensor - // TODO: this function is not compiled to expected ISA - template - __device__ void Run_optimized_src_address_calculation(const SrcData* p_src, - DstData* p_dst) const - { - constexpr auto vector_access_dim = Number{}; - - constexpr auto src_data_per_access = Number{}; - constexpr auto dst_data_per_access = Number{}; - - constexpr auto long_vector_size = Number{}; - - constexpr auto long_vector_access_lengths = SliceLengths::Modify( - vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size); - - // separate linear dimensions from non-linear dimensions - constexpr auto src_linear_dim_mask = SrcDesc::GetLinearDimensionMask(); - constexpr auto src_nonlinear_dim_mask = SrcDesc::GetNonLinearDimensionMask(); - - static_assert( - src_linear_dim_mask.At(SrcDstVectorReadWriteDim) || long_vector_size == SrcDataPerRead, - "Warning! SrcDstVectorReadWriteDim is not SrcDesc's linear dimension, performance " - "would drop"); - - // separate steps into linear and non-linear components, accoording to src tensor - constexpr auto linear_long_vector_access_lengths = - mask_lengths(long_vector_access_lengths, src_linear_dim_mask); - - constexpr auto nonlinear_long_vector_access_lengths = - mask_lengths(long_vector_access_lengths, src_nonlinear_dim_mask); - - // loop over src's non-linear dimensions - ford{}([&]( - auto nonlinear_dim_long_vector_access_id) { - - // calculate step-sizes along src's nonlinear dimensions - auto nonlinear_dim_data_steps = nonlinear_dim_long_vector_access_id; - nonlinear_dim_data_steps(vector_access_dim) = - long_vector_size * nonlinear_dim_long_vector_access_id[vector_access_dim]; - - // move src cooridnate along nonlinear dimensions - // this coordinate contains run-time per-thread offset - const auto src_nonlinear_coord = mSrcSliceOrigin + nonlinear_dim_data_steps; - - // loop over src's linear dimensions - ford{}([&]( - auto linear_dim_long_vector_access_id) { - - // step-sizes along src's linear dimensions - auto linear_dim_data_steps = linear_dim_long_vector_access_id; - linear_dim_data_steps(vector_access_dim) = - long_vector_size * linear_dim_long_vector_access_id[vector_access_dim]; - - // buffer to hold a long-vector - SrcData p_src_long_vector[long_vector_size]; - - // zero out buffer - for(index_t i = 0; i < long_vector_size; ++i) - { - p_src_long_vector[i] = 0; - } - - // Loop over SrcDstVectorReadWriteDim, and load data from src to the - // long-vector buffer. - // If SrcDstVectorReadWriteDim is src's linear dimension, then src's - // offset-diff due to this looping is known at compile-time. If - // SrcDstVectorReadWriteDim is src's nonlinear dimension, then src's - // offset-diff due to this looping is only known at run-time. 
For best - // performance, SrcDstVectorReadWriteDim, should be src's linear dimension - for(index_t i = 0; i < long_vector_size / src_data_per_access; ++i) - { - auto scalar_id = make_zero_array(); - scalar_id(vector_access_dim) = i * src_data_per_access; - - const index_t buffer_offset = i * src_data_per_access; - - // move src cooridnate along linear dimensions - const auto src_coord = - src_nonlinear_coord + (linear_dim_data_steps + scalar_id); - -#if CK_EXPERIMENTAL_TENSOR_COORDINATE_USE_CALCULATE_OFFSET_DIFF // tweaking - // this is src compile-time offset - const index_t src_linear_offset = - src_nonlinear_coord.CalculateOffsetDiff(linear_dim_data_steps + scalar_id); -#else - // this is src compile-time offset - const index_t src_linear_offset = - src_coord.GetOffset() - src_nonlinear_coord.GetOffset(); -#endif - - // Check src data's valid mapping situation, only check the first data in this - // src - // vector. It's user's responsiblity to make sure all data in the src vector - // has the valid/invalid mapping situation - if(src_coord.IsOffsetValidAssumingUpperIndexIsValid()) - { - transfer_data(p_src, - src_nonlinear_coord.GetOffset() + - src_linear_offset, - p_src_long_vector, - buffer_offset); - } - } - - // SrcData to DstData conversion - DstData p_dst_long_vector[long_vector_size]; - - for(index_t i = 0; i < long_vector_size; ++i) - { - p_dst_long_vector[i] = type_convert{}(p_src_long_vector[i]); - } - - // store data from the long-vector buffer to dst - for(index_t i = 0; i < long_vector_size / dst_data_per_access; ++i) - { - auto scalar_id = make_zero_array(); - scalar_id(vector_access_dim) = i * dst_data_per_access; - - const index_t buffer_offset = i * dst_data_per_access; - - // dst offset is calculated here, without explicitly separating into - // compile-time and per-thread component - const auto dst_coord = mDstSliceOrigin + (nonlinear_dim_data_steps + - linear_dim_data_steps + scalar_id); - - // Check dst data's valid mapping situation, only check the first data in this - // dst - // vector. It's user's responsiblity to make sure all data in the dst vector - // has the valid/invalid mapping situation - if(dst_coord.IsOffsetValidAssumingUpperIndexIsValid()) - { - transfer_data( - p_dst_long_vector, buffer_offset, p_dst, dst_coord.GetOffset()); - } - } - }); - }); - } - - // This version is optimized for address calculation of dst tensor - // TODO: this function is not compiled to expected ISA - template - __device__ void Run_optimized_dst_address_calculation(const SrcData* p_src, - DstData* p_dst) const - { - constexpr auto vector_access_dim = Number{}; - - constexpr auto src_data_per_access = Number{}; - constexpr auto dst_data_per_access = Number{}; - - constexpr auto long_vector_size = Number{}; - - constexpr auto long_vector_access_lengths = SliceLengths::Modify( - vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size); - - // separate linear dimensions from non-linear dimensions - constexpr auto dst_linear_dim_mask = DstDesc::GetLinearDimensionMask(); - constexpr auto dst_nonlinear_dim_mask = DstDesc::GetNonLinearDimensionMask(); - - static_assert( - dst_linear_dim_mask.At(SrcDstVectorReadWriteDim) || long_vector_size == DstDataPerWrite, - "Warning! 
SrcDstVectorReadWriteDim is not DstDesc's linear dimension, performance " - "would drop"); - - // separate steps into linear and non-linear components, accoording to dst tensor - constexpr auto linear_long_vector_access_lengths = - mask_lengths(long_vector_access_lengths, dst_linear_dim_mask); - - constexpr auto nonlinear_long_vector_access_lengths = - mask_lengths(long_vector_access_lengths, dst_nonlinear_dim_mask); - - // loop over dst's non-linear dimensions - ford{}([&]( - auto nonlinear_dim_long_vector_access_id) { - - // calculate step-sizes along dst's nonlinear dimensions - auto nonlinear_dim_data_steps = nonlinear_dim_long_vector_access_id; - nonlinear_dim_data_steps(vector_access_dim) = - long_vector_size * nonlinear_dim_long_vector_access_id[vector_access_dim]; - - // move dst cooridnate along nonlinear dimensions - // this coordinate contains run-time per-thread offset - const auto dst_nonlinear_coord = mDstSliceOrigin + nonlinear_dim_data_steps; - - // loop over dst's linear dimensions - ford{}([&]( - auto linear_dim_long_vector_access_id) { - - // step-sizes along dst's linear dimensions - auto linear_dim_data_steps = linear_dim_long_vector_access_id; - linear_dim_data_steps(vector_access_dim) = - long_vector_size * linear_dim_long_vector_access_id[vector_access_dim]; - - // buffer to hold a long-vector - SrcData p_src_long_vector[long_vector_size]; - - // zero out buffer - for(index_t i = 0; i < long_vector_size; ++i) - { - p_src_long_vector[i] = 0; - } - - // Loop over SrcDstVectorReadWriteDim, and load data from src to the - // long-vector buffer. - // If SrcDstVectorReadWriteDim is dst's linear dimension, then dst's - // offset-diff due to this looping is known at compile-time. If - // SrcDstVectorReadWriteDim is dst's nonlinear dimension, then dst's - // offset-diff due to this looping is only known at run-time. For best - // performance, SrcDstVectorReadWriteDim, should be dst's linear dimension - for(index_t i = 0; i < long_vector_size / src_data_per_access; ++i) - { - auto scalar_id = make_zero_array(); - scalar_id(vector_access_dim) = i * src_data_per_access; - - const index_t buffer_offset = i * src_data_per_access; - - // src offset is calculated here, without explicitly separating into - // compile-time and per-thread component - const auto src_coord = mSrcSliceOrigin + (nonlinear_dim_data_steps + - linear_dim_data_steps + scalar_id); - - // Check src data's valid mapping situation, only check the first data in this - // src - // vector. 
It's user's responsiblity to make sure all data in the src vector - // has the valid/invalid mapping situation - if(src_coord.IsOffsetValidAssumingUpperIndexIsValid()) - { - transfer_data( - p_src, src_coord.GetOffset(), p_src_long_vector, buffer_offset); - } - } - - // SrcData to DstData conversion - DstData p_dst_long_vector[long_vector_size]; - - for(index_t i = 0; i < long_vector_size; ++i) - { - p_dst_long_vector[i] = type_convert{}(p_src_long_vector[i]); - } - - // store data from the long-vector buffer to dst - for(index_t i = 0; i < long_vector_size / dst_data_per_access; ++i) - { - auto scalar_id = make_zero_array(); - scalar_id(vector_access_dim) = i * dst_data_per_access; - - const index_t buffer_offset = i * dst_data_per_access; - - // move dst cooridnate along linear dimensions - const auto dst_coord = - dst_nonlinear_coord + (linear_dim_data_steps + scalar_id); - -#if CK_EXPERIMENTAL_TENSOR_COORDINATE_USE_CALCULATE_OFFSET_DIFF // tweaking - // this is dst compile-time offset - const index_t dst_linear_offset = - dst_nonlinear_coord.CalculateOffsetDiff(linear_dim_data_steps + scalar_id); -#else - // this is dst compile-time offset - const index_t dst_linear_offset = - dst_coord.GetOffset() - dst_nonlinear_coord.GetOffset(); -#endif - - // Check dst data's valid mapping situation, only check the first data in this - // dst - // vector. It's user's responsiblity to make sure all data in the dst vector - // has the valid/invalid mapping situation - if(dst_coord.IsOffsetValidAssumingUpperIndexIsValid()) - { - transfer_data(p_dst_long_vector, - buffer_offset, - p_dst, - dst_nonlinear_coord.GetOffset() + - dst_linear_offset); - } - } - }); - }); - } - - __device__ static constexpr bool HasWorkingOptimizedAddressCalculation() - { -#if CK_EXPERIMENTAL_THREADWISE_COPY_V4R2_USE_OPTIMIZED_ADDRESS_CACLULATION // tweaking - return true; -#else - return false; -#endif - } - template __device__ void MoveSrcSliceWindow(const T& step_sizes_, integral_constant) diff --git a/composable_kernel/include/utility/amd_buffer_addressing.hpp b/composable_kernel/include/utility/amd_buffer_addressing.hpp index a308e710f9..9176241bfc 100644 --- a/composable_kernel/include/utility/amd_buffer_addressing.hpp +++ b/composable_kernel/include/utility/amd_buffer_addressing.hpp @@ -5,118 +5,119 @@ namespace ck { -// For 128bit SGPRs in buffer_load and buffer_store instructions +// For 128 bit SGPRs to supply resource constant in buffer instructions // https://rocm-documentation.readthedocs.io/en/latest/GCN_ISA_Manuals/testdocbook.html#vector-memory-buffer-instructions template -union BufferAddressConfig +union BufferResourceConstant { int32x4_t data; T* address[2]; int32_t range[4]; + int32_t config[4]; }; -__device__ float __llvm_amdgcn_buffer_load_f32(int32x4_t rsrc, +__device__ float __llvm_amdgcn_buffer_load_f32(int32x4_t srsrc, index_t vindex, index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.load.f32"); __device__ float2_t -__llvm_amdgcn_buffer_load_f32x2(int32x4_t rsrc, +__llvm_amdgcn_buffer_load_f32x2(int32x4_t srsrc, index_t vindex, index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.load.v2f32"); __device__ float4_t -__llvm_amdgcn_buffer_load_f32x4(int32x4_t rsrc, +__llvm_amdgcn_buffer_load_f32x4(int32x4_t srsrc, index_t vindex, index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.load.v4f32"); -__device__ half_t __llvm_amdgcn_buffer_load_f16(int32x4_t rsrc, +__device__ half_t __llvm_amdgcn_buffer_load_f16(int32x4_t srsrc, index_t vindex, index_t offset, bool glc, bool slc) 
__asm("llvm.amdgcn.buffer.load.f16"); -__device__ half2_t __llvm_amdgcn_buffer_load_f16x2(int32x4_t rsrc, +__device__ half2_t __llvm_amdgcn_buffer_load_f16x2(int32x4_t srsrc, index_t vindex, index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.load.v2f16"); -__device__ half4_t __llvm_amdgcn_buffer_load_f16x4(int32x4_t rsrc, +__device__ half4_t __llvm_amdgcn_buffer_load_f16x4(int32x4_t srsrc, index_t vindex, index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.load.v4f16"); -__device__ ushort __llvm_amdgcn_buffer_load_bf16(int32x4_t rsrc, +__device__ ushort __llvm_amdgcn_buffer_load_bf16(int32x4_t srsrc, index_t vindex, index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.load.bf16"); __device__ ushort2_t -__llvm_amdgcn_buffer_load_bf16x2(int32x4_t rsrc, +__llvm_amdgcn_buffer_load_bf16x2(int32x4_t srsrc, index_t vindex, index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.load.v2bf16"); __device__ ushort4_t -__llvm_amdgcn_buffer_load_bf16x4(int32x4_t rsrc, +__llvm_amdgcn_buffer_load_bf16x4(int32x4_t srsrc, index_t vindex, index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.load.v4bf16"); __device__ void __llvm_amdgcn_buffer_store_f32(float vdata, - int32x4_t rsrc, + int32x4_t srsrc, index_t vindex, index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.store.f32"); __device__ void __llvm_amdgcn_buffer_store_f32x2(float2_t vdata, - int32x4_t rsrc, + int32x4_t srsrc, index_t vindex, index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.store.v2f32"); __device__ void __llvm_amdgcn_buffer_store_f32x4(float4_t vdata, - int32x4_t rsrc, + int32x4_t srsrc, index_t vindex, index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.store.v4f32"); __device__ void __llvm_amdgcn_buffer_store_f16(half_t vdata, - int32x4_t rsrc, + int32x4_t srsrc, index_t vindex, index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.store.f16"); __device__ void __llvm_amdgcn_buffer_store_f16x2(half2_t vdata, - int32x4_t rsrc, + int32x4_t srsrc, index_t vindex, index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.store.v2f16"); __device__ void __llvm_amdgcn_buffer_store_f16x4(half4_t vdata, - int32x4_t rsrc, + int32x4_t srsrc, index_t vindex, index_t offset, bool glc, bool slc) __asm("llvm.amdgcn.buffer.store.v4f16"); __device__ void __llvm_amdgcn_buffer_store_bf16(ushort vdata, - int32x4_t rsrc, + int32x4_t srsrc, index_t vindex, index_t offset, bool glc, @@ -124,7 +125,7 @@ __device__ void __llvm_amdgcn_buffer_store_bf16(ushort vdata, __device__ void __llvm_amdgcn_buffer_store_bf16x2(ushort2_t vdata, - int32x4_t rsrc, + int32x4_t srsrc, index_t vindex, index_t offset, bool glc, @@ -132,7 +133,7 @@ __llvm_amdgcn_buffer_store_bf16x2(ushort2_t vdata, __device__ void __llvm_amdgcn_buffer_store_bf16x4(ushort4_t vdata, - int32x4_t rsrc, + int32x4_t srsrc, index_t vindex, index_t offset, bool glc, @@ -140,646 +141,986 @@ __llvm_amdgcn_buffer_store_bf16x4(ushort4_t vdata, __device__ void __llvm_amdgcn_buffer_atomic_add_f32(float vdata, - int32x4_t rsrc, + int32x4_t srsrc, index_t vindex, index_t offset, bool slc) __asm("llvm.amdgcn.buffer.atomic.fadd.f32"); // buffer_load requires: -// 1) p_src must be in global memory space, d_dst must be vgpr -// 2) p_src to be a block-invariant pointer. +// 1) p_src_thread must be in global memory space, p_dst_thread must be vgpr +// 2) p_src_thread to be a wavewise pointer. // It is user's responsibility to make sure that is true. 
template -__device__ typename vector_type::MemoryType amd_buffer_load( - const T* p_src_block, index_t src_thread_data_offset, index_t src_const_data_offset); +__device__ typename vector_type::MemoryType +amd_buffer_load(const T* p_src_wave, + index_t src_thread_data_offset, + bool src_thread_data_valid, + index_t src_data_range); // buffer_store requires: -// 1) p_src must be in vgpr space, d_dst must be global memory -// 2) p_dst to be a block-invariant pointer. +// 1) p_src_thread must be in vgpr space, p_dst_thread must be global memory +// 2) p_dst_thread to be a wavewise pointer. // It is user's responsibility to make sure that is true. template -__device__ void amd_buffer_store(const T* p_src, - T* p_dst_block, +__device__ void amd_buffer_store(const T* p_src_thread, + T* p_dst_wave, index_t dst_thread_data_offset, - index_t dst_const_data_offset); + bool dst_thread_data_valid, + index_t dst_data_range); +// buffer_atomic requires: +// 1) p_src_thread must be in vgpr space, p_dst_thread must be global memory +// 2) p_dst_thread to be a wavewise pointer. +// It is user's responsibility to make sure that is true. template -__device__ void amd_buffer_atomic_add(const T* p_src, - T* p_dst_block, +__device__ void amd_buffer_atomic_add(const T* p_src_thread, + T* p_dst_wave, index_t dst_thread_data_offset, - index_t dst_const_data_offset); + bool dst_thread_data_valid, + index_t dst_data_range); template <> -__device__ float amd_buffer_load(const float* p_src_block, +__device__ float amd_buffer_load(const float* p_src_wave, index_t src_thread_data_offset, - index_t src_const_data_offset) + bool src_thread_data_valid, + index_t src_data_range) { - BufferAddressConfig src_block_config; + BufferResourceConstant src_wave_buffer_resource; - // fill in byte 0 - 1 - src_block_config.address[0] = const_cast(p_src_block); - // fill in byte 2 - src_block_config.range[2] = -1; - // fill in byte 3 - src_block_config.range[3] = 0x00027000; + // wavewise base address (64 bit) + src_wave_buffer_resource.address[0] = const_cast(p_src_wave); + // wavewise range (32 bit) + src_wave_buffer_resource.range[2] = src_data_range * sizeof(float); + // wavewise setting (32 bit) + src_wave_buffer_resource.config[3] = 0x00027000; index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); - index_t src_const_addr_offset = src_const_data_offset * sizeof(float); + +#if 1 // debug +#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK + return __llvm_amdgcn_buffer_load_f32(src_wave_buffer_resource.data, + 0, + src_thread_data_valid ? src_thread_addr_offset + : 0xffffffff, + false, + false); +#else + uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff; return __llvm_amdgcn_buffer_load_f32( - src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false); + src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false); +#endif +#else + return src_thread_data_valid + ?
__llvm_amdgcn_buffer_load_f32( + src_wave_buffer_resource.data, 0, src_thread_addr_offset, false, false) + : 0; +#endif } template <> -__device__ float2_t amd_buffer_load(const float* p_src_block, +__device__ float2_t amd_buffer_load(const float* p_src_wave, index_t src_thread_data_offset, - index_t src_const_data_offset) + bool src_thread_data_valid, + index_t src_data_range) { - BufferAddressConfig src_block_config; + BufferResourceConstant src_wave_buffer_resource; - // fill in byte 0 - 1 - src_block_config.address[0] = const_cast(p_src_block); - // fill in byte 2 - src_block_config.range[2] = -1; - // fill in byte 3 - src_block_config.range[3] = 0x00027000; + // wavewise base address (64 bit) + src_wave_buffer_resource.address[0] = const_cast(p_src_wave); + // wavewise range (32 bit) + src_wave_buffer_resource.range[2] = src_data_range * sizeof(float); + // wavewise setting (32 bit) + src_wave_buffer_resource.config[3] = 0x00027000; index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); - index_t src_const_addr_offset = src_const_data_offset * sizeof(float); + +#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK + return __llvm_amdgcn_buffer_load_f32x2(src_wave_buffer_resource.data, + 0, + src_thread_data_valid ? src_thread_addr_offset + : 0xffffffff, + false, + false); +#else + uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff; return __llvm_amdgcn_buffer_load_f32x2( - src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false); + src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false); +#endif } template <> -__device__ float4_t amd_buffer_load(const float* p_src_block, +__device__ float4_t amd_buffer_load(const float* p_src_wave, index_t src_thread_data_offset, - index_t src_const_data_offset) + bool src_thread_data_valid, + index_t src_data_range) { - BufferAddressConfig src_block_config; + BufferResourceConstant src_wave_buffer_resource; - // fill in byte 0 - 1 - src_block_config.address[0] = const_cast(p_src_block); - // fill in byte 2 - src_block_config.range[2] = -1; - // fill in byte 3 - src_block_config.range[3] = 0x00027000; + // wavewise base address (64 bit) + src_wave_buffer_resource.address[0] = const_cast(p_src_wave); + // wavewise range (32 bit) + src_wave_buffer_resource.range[2] = src_data_range * sizeof(float); + // wavewise setting (32 bit) + src_wave_buffer_resource.config[3] = 0x00027000; index_t src_thread_addr_offset = src_thread_data_offset * sizeof(float); - index_t src_const_addr_offset = src_const_data_offset * sizeof(float); + +#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK + return __llvm_amdgcn_buffer_load_f32x4(src_wave_buffer_resource.data, + 0, + src_thread_data_valid ? src_thread_addr_offset + : 0xffffffff, + false, + false); +#else + uint32_t src_addr_shift = src_thread_data_valid ? 
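+                             // The offset trick: adding 0x7fffffff (~2GB) to the
+                             // byte offset of an invalid lane pushes it past the
+                             // range recorded in range[2] (buffer sizes are
+                             // limited to 2GB), so the hardware returns 0 for
+                             // the load instead of reading out of bounds.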
0 : 0x7fffffff; return __llvm_amdgcn_buffer_load_f32x4( - src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false); + src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false); +#endif } template <> -__device__ half_t amd_buffer_load(const half_t* p_src_block, +__device__ half_t amd_buffer_load(const half_t* p_src_wave, index_t src_thread_data_offset, - index_t src_const_data_offset) + bool src_thread_data_valid, + index_t src_data_range) { - BufferAddressConfig src_block_config; + BufferResourceConstant src_wave_buffer_resource; - // fill in byte 0 - 1 - src_block_config.address[0] = const_cast(p_src_block); - // fill in byte 2 - src_block_config.range[2] = -1; - // fill in byte 3 - src_block_config.range[3] = 0x00027000; + // wavewise base address (64 bit) + src_wave_buffer_resource.address[0] = const_cast(p_src_wave); + // wavewise range (32 bit) + src_wave_buffer_resource.range[2] = src_data_range * sizeof(half_t); + // wavewise setting (32 bit) + src_wave_buffer_resource.config[3] = 0x00027000; #if !CK_WORKAROUND_SWDEV_231101 index_t src_thread_addr_offset = src_thread_data_offset * sizeof(half_t); - index_t src_const_addr_offset = src_const_data_offset * sizeof(half_t); + +#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK + return __llvm_amdgcn_buffer_load_f16(src_wave_buffer_resource.data, + 0, + src_thread_data_valid ? src_thread_addr_offset + : 0xffffffff, + false, + false); +#else + uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff; return __llvm_amdgcn_buffer_load_f16( - src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false); + src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false); +#endif #else - return p_src_block[src_thread_data_offset + src_const_data_offset]; + return src_thread_data_valid ? p_src_wave[src_thread_data_offset] : 0; #endif } template <> -__device__ half2_t amd_buffer_load(const half_t* p_src_block, +__device__ half2_t amd_buffer_load(const half_t* p_src_wave, index_t src_thread_data_offset, - index_t src_const_data_offset) + bool src_thread_data_valid, + index_t src_data_range) { - BufferAddressConfig src_block_config; + BufferResourceConstant src_wave_buffer_resource; - // fill in byte 0 - 1 - src_block_config.address[0] = const_cast(p_src_block); - // fill in byte 2 - src_block_config.range[2] = -1; - // fill in byte 3 - src_block_config.range[3] = 0x00027000; + // wavewise base address (64 bit) + src_wave_buffer_resource.address[0] = const_cast(p_src_wave); + // wavewise range (32 bit) + src_wave_buffer_resource.range[2] = src_data_range * sizeof(half_t); + // wavewise setting (32 bit) + src_wave_buffer_resource.config[3] = 0x00027000; index_t src_thread_addr_offset = src_thread_data_offset * sizeof(half_t); - index_t src_const_addr_offset = src_const_data_offset * sizeof(half_t); -#if !CK_WORKAROUND_SWDEV_231101 - return __llvm_amdgcn_buffer_load_f16x2( - src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false); +#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK + float dst_out_tmp = + __llvm_amdgcn_buffer_load_f32(src_wave_buffer_resource.data, + 0, + src_thread_data_valid ? src_thread_addr_offset : 0xffffffff, + false, + false); #else + uint32_t src_addr_shift = src_thread_data_valid ? 
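+                             // Note: half2_t (4 bytes) is fetched as a single
+                             // f32 dword and reinterpreted below, avoiding the
+                             // f16 buffer intrinsics (cf. the
+                             // CK_WORKAROUND_SWDEV_231101 guard on the scalar
+                             // half_t path above).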
0 : 0x7fffffff; + float dst_out_tmp = __llvm_amdgcn_buffer_load_f32( - src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false); + src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false); +#endif return *reinterpret_cast(&dst_out_tmp); -#endif } template <> -__device__ half4_t amd_buffer_load(const half_t* p_src_block, +__device__ half4_t amd_buffer_load(const half_t* p_src_wave, index_t src_thread_data_offset, - index_t src_const_data_offset) + bool src_thread_data_valid, + index_t src_data_range) { - BufferAddressConfig src_block_config; + BufferResourceConstant src_wave_buffer_resource; - // fill in byte 0 - 1 - src_block_config.address[0] = const_cast(p_src_block); - // fill in byte 2 - src_block_config.range[2] = -1; - // fill in byte 3 - src_block_config.range[3] = 0x00027000; + // wavewise base address (64 bit) + src_wave_buffer_resource.address[0] = const_cast(p_src_wave); + // wavewise range (32 bit) + src_wave_buffer_resource.range[2] = src_data_range * sizeof(half_t); + // wavewise setting (32 bit) + src_wave_buffer_resource.config[3] = 0x00027000; index_t src_thread_addr_offset = src_thread_data_offset * sizeof(half_t); - index_t src_const_addr_offset = src_const_data_offset * sizeof(half_t); -#if !CK_WORKAROUND_SWDEV_231101 - return __llvm_amdgcn_buffer_load_f16x4( - src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false); +#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK + float2_t dst_out_tmp = + __llvm_amdgcn_buffer_load_f32x2(src_wave_buffer_resource.data, + 0, + src_thread_data_valid ? src_thread_addr_offset : 0xffffffff, + false, + false); #else + uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff; + float2_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x2( - src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false); + src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false); +#endif return *reinterpret_cast(&dst_out_tmp); -#endif } template <> -__device__ half8_t amd_buffer_load(const half_t* p_src_block, +__device__ half8_t amd_buffer_load(const half_t* p_src_wave, index_t src_thread_data_offset, - index_t src_const_data_offset) + bool src_thread_data_valid, + index_t src_data_range) { - BufferAddressConfig src_block_config; + BufferResourceConstant src_wave_buffer_resource; - // fill in byte 0 - 1 - src_block_config.address[0] = const_cast(p_src_block); - // fill in byte 2 - src_block_config.range[2] = -1; - // fill in byte 3 - src_block_config.range[3] = 0x00027000; + // wavewise base address (64 bit) + src_wave_buffer_resource.address[0] = const_cast(p_src_wave); + // wavewise range (32 bit) + src_wave_buffer_resource.range[2] = src_data_range * sizeof(half_t); + // wavewise setting (32 bit) + src_wave_buffer_resource.config[3] = 0x00027000; index_t src_thread_addr_offset = src_thread_data_offset * sizeof(half_t); - index_t src_const_addr_offset = src_const_data_offset * sizeof(half_t); -#if !CK_WORKAROUND_SWDEV_231101 - static_assert(false, "wrong! not supported"); +#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK + float4_t dst_out_tmp = + __llvm_amdgcn_buffer_load_f32x4(src_wave_buffer_resource.data, + 0, + src_thread_data_valid ? src_thread_addr_offset : 0xffffffff, + false, + false); #else + uint32_t src_addr_shift = src_thread_data_valid ? 
0 : 0x7fffffff; + float4_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x4( - src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false); + src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false); +#endif return *reinterpret_cast(&dst_out_tmp); -#endif } template <> -__device__ ushort amd_buffer_load(const ushort* p_src_block, +__device__ ushort amd_buffer_load(const ushort* p_src_wave, index_t src_thread_data_offset, - index_t src_const_data_offset) + bool src_thread_data_valid, + index_t src_data_range) { - BufferAddressConfig src_block_config; + BufferResourceConstant src_wave_buffer_resource; - // fill in byte 0 - 1 - src_block_config.address[0] = const_cast(p_src_block); - // fill in byte 2 - src_block_config.range[2] = -1; - // fill in byte 3 - src_block_config.range[3] = 0x00027000; + // wavewise base address (64 bit) + src_wave_buffer_resource.address[0] = const_cast(p_src_wave); + // wavewise range (32 bit) + src_wave_buffer_resource.range[2] = src_data_range * sizeof(ushort); + // wavewise setting (32 bit) + src_wave_buffer_resource.config[3] = 0x00027000; #if !CK_WORKAROUND_SWDEV_231101 index_t src_thread_addr_offset = src_thread_data_offset * sizeof(ushort); - index_t src_const_addr_offset = src_const_data_offset * sizeof(ushort); + +#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK + return __llvm_amdgcn_buffer_load_bf16(src_wave_buffer_resource.data, + 0, + src_thread_data_valid ? src_thread_addr_offset + : 0xffffffff, + false, + false); +#else + uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff; return __llvm_amdgcn_buffer_load_bf16( - src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false); + src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false); +#endif + #else - return p_src_block[src_thread_data_offset + src_const_data_offset]; + return src_thread_data_valid ? p_src_wave[src_thread_data_offset] : 0; #endif } template <> -__device__ ushort2_t amd_buffer_load(const ushort* p_src_block, +__device__ ushort2_t amd_buffer_load(const ushort* p_src_wave, index_t src_thread_data_offset, - index_t src_const_data_offset) + bool src_thread_data_valid, + index_t src_data_range) { - BufferAddressConfig src_block_config; + BufferResourceConstant src_wave_buffer_resource; - // fill in byte 0 - 1 - src_block_config.address[0] = const_cast(p_src_block); - // fill in byte 2 - src_block_config.range[2] = -1; - // fill in byte 3 - src_block_config.range[3] = 0x00027000; + // wavewise base address (64 bit) + src_wave_buffer_resource.address[0] = const_cast(p_src_wave); + // wavewise range (32 bit) + src_wave_buffer_resource.range[2] = src_data_range * sizeof(ushort); + // wavewise setting (32 bit) + src_wave_buffer_resource.config[3] = 0x00027000; index_t src_thread_addr_offset = src_thread_data_offset * sizeof(ushort); - index_t src_const_addr_offset = src_const_data_offset * sizeof(ushort); -#if !CK_WORKAROUND_SWDEV_231101 - return __llvm_amdgcn_buffer_load_bf16x2( - src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false); +#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK + float dst_out_tmp = + __llvm_amdgcn_buffer_load_f32(src_wave_buffer_resource.data, + 0, + src_thread_data_valid ? src_thread_addr_offset : 0xffffffff, + false, + false); #else + uint32_t src_addr_shift = src_thread_data_valid ? 
0 : 0x7fffffff; + float dst_out_tmp = __llvm_amdgcn_buffer_load_f32( - src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false); + src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false); +#endif return *reinterpret_cast(&dst_out_tmp); -#endif } template <> -__device__ ushort4_t amd_buffer_load(const ushort* p_src_block, +__device__ ushort4_t amd_buffer_load(const ushort* p_src_wave, index_t src_thread_data_offset, - index_t src_const_data_offset) + bool src_thread_data_valid, + index_t src_data_range) { - BufferAddressConfig src_block_config; + BufferResourceConstant src_wave_buffer_resource; - // fill in byte 0 - 1 - src_block_config.address[0] = const_cast(p_src_block); - // fill in byte 2 - src_block_config.range[2] = -1; - // fill in byte 3 - src_block_config.range[3] = 0x00027000; + // wavewise base address (64 bit) + src_wave_buffer_resource.address[0] = const_cast(p_src_wave); + // wavewise range (32 bit) + src_wave_buffer_resource.range[2] = src_data_range * sizeof(ushort); + // wavewise setting (32 bit) + src_wave_buffer_resource.config[3] = 0x00027000; index_t src_thread_addr_offset = src_thread_data_offset * sizeof(ushort); - index_t src_const_addr_offset = src_const_data_offset * sizeof(ushort); -#if !CK_WORKAROUND_SWDEV_231101 - return __llvm_amdgcn_buffer_load_bf16x4( - src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false); +#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK + float2_t dst_out_tmp = + __llvm_amdgcn_buffer_load_f32x2(src_wave_buffer_resource.data, + 0, + src_thread_data_valid ? src_thread_addr_offset : 0xffffffff, + false, + false); #else + uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff; + float2_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x2( - src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false); + src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false); +#endif return *reinterpret_cast(&dst_out_tmp); -#endif } template <> -__device__ ushort8_t amd_buffer_load(const ushort* p_src_block, +__device__ ushort8_t amd_buffer_load(const ushort* p_src_wave, index_t src_thread_data_offset, - index_t src_const_data_offset) + bool src_thread_data_valid, + index_t src_data_range) { - BufferAddressConfig src_block_config; + BufferResourceConstant src_wave_buffer_resource; - // fill in byte 0 - 1 - src_block_config.address[0] = const_cast(p_src_block); - // fill in byte 2 - src_block_config.range[2] = -1; - // fill in byte 3 - src_block_config.range[3] = 0x00027000; + // wavewise base address (64 bit) + src_wave_buffer_resource.address[0] = const_cast(p_src_wave); + // wavewise range (32 bit) + src_wave_buffer_resource.range[2] = src_data_range * sizeof(ushort); + // wavewise setting (32 bit) + src_wave_buffer_resource.config[3] = 0x00027000; index_t src_thread_addr_offset = src_thread_data_offset * sizeof(ushort); - index_t src_const_addr_offset = src_const_data_offset * sizeof(ushort); -#if !CK_WORKAROUND_SWDEV_231101 - static_assert(false, "wrong! not implemented"); +#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK + float4_t dst_out_tmp = + __llvm_amdgcn_buffer_load_f32x4(src_wave_buffer_resource.data, + 0, + src_thread_data_valid ? src_thread_addr_offset : 0xffffffff, + false, + false); #else + uint32_t src_addr_shift = src_thread_data_valid ? 
0 : 0x7fffffff; + float4_t dst_out_tmp = __llvm_amdgcn_buffer_load_f32x4( - src_block_config.data, 0, src_thread_addr_offset + src_const_addr_offset, false, false); + src_wave_buffer_resource.data, 0, src_addr_shift + src_thread_addr_offset, false, false); +#endif return *reinterpret_cast(&dst_out_tmp); +} + +template <> +__device__ void amd_buffer_store(const float* p_src_thread, + float* p_dst_wave, + index_t dst_thread_data_offset, + bool dst_thread_data_valid, + index_t dst_data_range) +{ + BufferResourceConstant dst_wave_buffer_resource; + + // wavewise base address (64 bit) + dst_wave_buffer_resource.address[0] = p_dst_wave; + // wavewise range (32 bit) + dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(float); + // wavewise setting (32 bit) + dst_wave_buffer_resource.config[3] = 0x00027000; + + index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); + +#if 1 // debug +#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK + __llvm_amdgcn_buffer_store_f32(*p_src_thread, + dst_wave_buffer_resource.data, + 0, + dst_thread_data_valid ? dst_thread_addr_offset : 0xffffffff, + false, + false); +#else + uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff; + + __llvm_amdgcn_buffer_store_f32(*p_src_thread, + dst_wave_buffer_resource.data, + 0, + dst_addr_shift + dst_thread_addr_offset, + false, + false); +#endif +#else + if(dst_thread_data_valid) + { + __llvm_amdgcn_buffer_store_f32( + *p_src_thread, dst_wave_buffer_resource.data, 0, dst_thread_addr_offset, false, false); + } #endif } template <> -__device__ void amd_buffer_store(const float* p_src, - float* p_dst_block, +__device__ void amd_buffer_store(const float* p_src_thread, + float* p_dst_wave, index_t dst_thread_data_offset, - index_t dst_const_data_offset) + bool dst_thread_data_valid, + index_t dst_data_range) { - BufferAddressConfig dst_block_config; + BufferResourceConstant dst_wave_buffer_resource; - // fill in byte 0 - 1 - dst_block_config.address[0] = p_dst_block; - // fill in byte 2 - dst_block_config.range[2] = -1; - // fill in byte 3 - dst_block_config.range[3] = 0x00027000; + // wavewise base address (64 bit) + dst_wave_buffer_resource.address[0] = p_dst_wave; + // wavewise range (32 bit) + dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(float); + // wavewise setting (32 bit) + dst_wave_buffer_resource.config[3] = 0x00027000; index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); - index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); - __llvm_amdgcn_buffer_store_f32(*p_src, - dst_block_config.data, - 0, - dst_thread_addr_offset + dst_const_addr_offset, - false, - false); -} - -template <> -__device__ void amd_buffer_store(const float* p_src, - float* p_dst_block, - index_t dst_thread_data_offset, - index_t dst_const_data_offset) -{ - BufferAddressConfig dst_block_config; - - // fill in byte 0 - 1 - dst_block_config.address[0] = p_dst_block; - // fill in byte 2 - dst_block_config.range[2] = -1; - // fill in byte 3 - dst_block_config.range[3] = 0x00027000; - - index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); - index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); - - __llvm_amdgcn_buffer_store_f32x2(*reinterpret_cast(p_src), - dst_block_config.data, +#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK + __llvm_amdgcn_buffer_store_f32x2(*reinterpret_cast(p_src_thread), + dst_wave_buffer_resource.data, 0, - dst_thread_addr_offset + dst_const_addr_offset, + dst_thread_data_valid ? 
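+                                     // With the offset trick disabled, an
+                                     // invalid lane passes offset 0xffffffff,
+                                     // which lies beyond any 2GB buffer range,
+                                     // so the hardware silently drops the write.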
dst_thread_addr_offset : 0xffffffff, false, false); -} - -template <> -__device__ void amd_buffer_store(const float* p_src, - float* p_dst_block, - index_t dst_thread_data_offset, - index_t dst_const_data_offset) -{ - BufferAddressConfig dst_block_config; - - // fill in byte 0 - 1 - dst_block_config.address[0] = p_dst_block; - // fill in byte 2 - dst_block_config.range[2] = -1; - // fill in byte 3 - dst_block_config.range[3] = 0x00027000; - - index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); - index_t dst_const_addr_offset = dst_const_data_offset * sizeof(float); - - __llvm_amdgcn_buffer_store_f32x4(*reinterpret_cast(p_src), - dst_block_config.data, - 0, - dst_thread_addr_offset + dst_const_addr_offset, - false, - false); -} - -template <> -__device__ void amd_buffer_store(const half_t* p_src, - half_t* p_dst_block, - index_t dst_thread_data_offset, - index_t dst_const_data_offset) -{ - BufferAddressConfig dst_block_config; - - // fill in byte 0 - 1 - dst_block_config.address[0] = p_dst_block; - // fill in byte 2 - dst_block_config.range[2] = -1; - // fill in byte 3 - dst_block_config.range[3] = 0x00027000; - -#if !CK_WORKAROUND_SWDEV_231101 - index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(half_t); - index_t dst_const_addr_offset = dst_const_data_offset * sizeof(half_t); - - __llvm_amdgcn_buffer_store_f16(*p_src, - dst_block_config.data, - 0, - dst_thread_addr_offset + dst_const_addr_offset, - false, - false); #else - p_dst_block[dst_thread_data_offset + dst_const_data_offset] = *p_src; + uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff; + + __llvm_amdgcn_buffer_store_f32x2(*reinterpret_cast(p_src_thread), + dst_wave_buffer_resource.data, + 0, + dst_addr_shift + dst_thread_addr_offset, + false, + false); #endif } template <> -__device__ void amd_buffer_store(const half_t* p_src, - half_t* p_dst_block, - index_t dst_thread_data_offset, - index_t dst_const_data_offset) +__device__ void amd_buffer_store(const float* p_src_thread, + float* p_dst_wave, + index_t dst_thread_data_offset, + bool dst_thread_data_valid, + index_t dst_data_range) { - BufferAddressConfig dst_block_config; + BufferResourceConstant dst_wave_buffer_resource; - // fill in byte 0 - 1 - dst_block_config.address[0] = p_dst_block; - // fill in byte 2 - dst_block_config.range[2] = -1; - // fill in byte 3 - dst_block_config.range[3] = 0x00027000; + // wavewise base address (64 bit) + dst_wave_buffer_resource.address[0] = p_dst_wave; + // wavewise range (32 bit) + dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(float); + // wavewise setting (32 bit) + dst_wave_buffer_resource.config[3] = 0x00027000; - index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(half_t); - index_t dst_const_addr_offset = dst_const_data_offset * sizeof(half_t); + index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float); -#if !CK_WORKAROUND_SWDEV_231101 - __llvm_amdgcn_buffer_store_f16x2(*reinterpret_cast(p_src), - dst_block_config.data, +#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK + __llvm_amdgcn_buffer_store_f32x4(*reinterpret_cast(p_src_thread), + dst_wave_buffer_resource.data, 0, - dst_thread_addr_offset + dst_const_addr_offset, + dst_thread_data_valid ? dst_thread_addr_offset : 0xffffffff, false, false); #else - const float* p_src_tmp = reinterpret_cast(p_src); + uint32_t dst_addr_shift = dst_thread_data_valid ? 
0 : 0x7fffffff; + + __llvm_amdgcn_buffer_store_f32x4(*reinterpret_cast(p_src_thread), + dst_wave_buffer_resource.data, + 0, + dst_addr_shift + dst_thread_addr_offset, + false, + false); +#endif +} + +template <> +__device__ void amd_buffer_store(const half_t* p_src_thread, + half_t* p_dst_wave, + index_t dst_thread_data_offset, + bool dst_thread_data_valid, + index_t dst_data_range) +{ + BufferResourceConstant dst_wave_buffer_resource; + + // wavewise base address (64 bit) + dst_wave_buffer_resource.address[0] = p_dst_wave; + // wavewise range (32 bit) + dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(half_t); + // wavewise setting (32 bit) + dst_wave_buffer_resource.config[3] = 0x00027000; + +#if !CK_WORKAROUND_SWDEV_231101 + index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(half_t); + +#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK + __llvm_amdgcn_buffer_store_f16(*p_src_thread, + dst_wave_buffer_resource.data, + 0, + dst_thread_data_valid ? dst_thread_addr_offset : 0xffffffff, + false, + false); +#else + uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff; + + __llvm_amdgcn_buffer_store_f16(*p_src_thread, + dst_wave_buffer_resource.data, + 0, + dst_addr_shift + dst_thread_addr_offset, + false, + false); +#endif + +#else + if(dst_thread_data_valid) + { + p_dst_wave[dst_thread_data_offset] = *p_src_thread; + } +#endif +} + +template <> +__device__ void amd_buffer_store(const half_t* p_src_thread, + half_t* p_dst_wave, + index_t dst_thread_data_offset, + bool dst_thread_data_valid, + index_t dst_data_range) +{ + BufferResourceConstant dst_wave_buffer_resource; + + // wavewise base address (64 bit) + dst_wave_buffer_resource.address[0] = p_dst_wave; + // wavewise range (32 bit) + dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(half_t); + // wavewise setting (32 bit) + dst_wave_buffer_resource.config[3] = 0x00027000; + + index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(half_t); + + const float* p_src_tmp = reinterpret_cast(p_src_thread); + +#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK + __llvm_amdgcn_buffer_store_f32(*p_src_tmp, + dst_wave_buffer_resource.data, + 0, + dst_thread_data_valid ? dst_thread_addr_offset : 0xffffffff, + false, + false); +#else + uint32_t dst_addr_shift = dst_thread_data_valid ? 
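+                             // Store-side counterpart of the f16 load emulation:
+                             // the two halves were reinterpreted as one f32
+                             // (p_src_tmp), so only the raw bit pattern is
+                             // written; the range check still operates on byte
+                             // addresses.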
0 : 0x7fffffff; __llvm_amdgcn_buffer_store_f32(*p_src_tmp, - dst_block_config.data, + dst_wave_buffer_resource.data, 0, - dst_thread_addr_offset + dst_const_addr_offset, + dst_addr_shift + dst_thread_addr_offset, false, false); #endif } template <> -__device__ void amd_buffer_store(const half_t* p_src, - half_t* p_dst_block, +__device__ void amd_buffer_store(const half_t* p_src_thread, + half_t* p_dst_wave, index_t dst_thread_data_offset, - index_t dst_const_data_offset) + bool dst_thread_data_valid, + index_t dst_data_range) { + BufferResourceConstant dst_wave_buffer_resource; + + // wavewise base address (64 bit) + dst_wave_buffer_resource.address[0] = p_dst_wave; + // wavewise range (32 bit) + dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(half_t); + // wavewise setting (32 bit) + dst_wave_buffer_resource.config[3] = 0x00027000; + index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(half_t); - index_t dst_const_addr_offset = dst_const_data_offset * sizeof(half_t); - BufferAddressConfig dst_block_config; + const float2_t* p_src_tmp = reinterpret_cast(p_src_thread); - // fill in byte 0 - 1 - dst_block_config.address[0] = p_dst_block; - // fill in byte 2 - dst_block_config.range[2] = -1; - // fill in byte 3 - dst_block_config.range[3] = 0x00027000; - -#if !CK_WORKAROUND_SWDEV_231101 - __llvm_amdgcn_buffer_store_f16x4(*reinterpret_cast(p_src), - dst_block_config.data, +#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK + __llvm_amdgcn_buffer_store_f32x2(*p_src_tmp, + dst_wave_buffer_resource.data, 0, - dst_thread_addr_offset + dst_const_addr_offset, + dst_thread_data_valid ? dst_thread_addr_offset : 0xffffffff, false, false); #else - const float2_t* p_src_tmp = reinterpret_cast(p_src); + uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff; __llvm_amdgcn_buffer_store_f32x2(*p_src_tmp, - dst_block_config.data, + dst_wave_buffer_resource.data, 0, - dst_thread_addr_offset + dst_const_addr_offset, + dst_addr_shift + dst_thread_addr_offset, false, false); #endif } template <> -__device__ void amd_buffer_store(const ushort* p_src, - ushort* p_dst_block, +__device__ void amd_buffer_store(const half_t* p_src_thread, + half_t* p_dst_wave, index_t dst_thread_data_offset, - index_t dst_const_data_offset) + bool dst_thread_data_valid, + index_t dst_data_range) { - BufferAddressConfig dst_block_config; + BufferResourceConstant dst_wave_buffer_resource; - // fill in byte 0 - 1 - dst_block_config.address[0] = p_dst_block; - // fill in byte 2 - dst_block_config.range[2] = -1; - // fill in byte 3 - dst_block_config.range[3] = 0x00027000; + // wavewise base address (64 bit) + dst_wave_buffer_resource.address[0] = p_dst_wave; + // wavewise range (32 bit) + dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(half_t); + // wavewise setting (32 bit) + dst_wave_buffer_resource.config[3] = 0x00027000; + + index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(half_t); + + const float4_t* p_src_tmp = reinterpret_cast(p_src_thread); + +#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK + __llvm_amdgcn_buffer_store_f32x4(*p_src_tmp, + dst_wave_buffer_resource.data, + 0, + dst_thread_data_valid ? dst_thread_addr_offset : 0xffffffff, + false, + false); +#else + uint32_t dst_addr_shift = dst_thread_data_valid ? 
0 : 0x7fffffff; + + __llvm_amdgcn_buffer_store_f32x4(*p_src_tmp, + dst_wave_buffer_resource.data, + 0, + dst_addr_shift + dst_thread_addr_offset, + false, + false); +#endif +} + +template <> +__device__ void amd_buffer_store(const ushort* p_src_thread, + ushort* p_dst_wave, + index_t dst_thread_data_offset, + bool dst_thread_data_valid, + index_t dst_data_range) +{ + BufferResourceConstant dst_wave_buffer_resource; + + // wavewise base address (64 bit) + dst_wave_buffer_resource.address[0] = p_dst_wave; + // wavewise range (32 bit) + dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(ushort); + // wavewise setting (32 bit) + dst_wave_buffer_resource.config[3] = 0x00027000; #if !CK_WORKAROUND_SWDEV_231101 index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(ushort); - index_t dst_const_addr_offset = dst_const_data_offset * sizeof(ushort); - __llvm_amdgcn_buffer_store_bf16(*p_src, - dst_block_config.data, +#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK + __llvm_amdgcn_buffer_store_bf16(*p_src_thread, + dst_wave_buffer_resource.data, 0, - dst_thread_addr_offset + dst_const_addr_offset, + dst_thread_data_valid ? dst_thread_addr_offset : 0xffffffff, false, false); #else - p_dst_block[dst_thread_data_offset + dst_const_data_offset] = *p_src; + uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff; + + __llvm_amdgcn_buffer_store_bf16(*p_src_thread, + dst_wave_buffer_resource.data, + 0, + dst_addr_shift + dst_thread_addr_offset, + false, + false); +#endif + +#else + if(dst_thread_data_valid) + { + p_dst_wave[dst_thread_data_offset] = *p_src_thread; + } #endif } template <> -__device__ void amd_buffer_store(const ushort* p_src, - ushort* p_dst_block, +__device__ void amd_buffer_store(const ushort* p_src_thread, + ushort* p_dst_wave, index_t dst_thread_data_offset, - index_t dst_const_data_offset) + bool dst_thread_data_valid, + index_t dst_data_range) { - BufferAddressConfig dst_block_config; + BufferResourceConstant dst_wave_buffer_resource; - // fill in byte 0 - 1 - dst_block_config.address[0] = p_dst_block; - // fill in byte 2 - dst_block_config.range[2] = -1; - // fill in byte 3 - dst_block_config.range[3] = 0x00027000; + // wavewise base address (64 bit) + dst_wave_buffer_resource.address[0] = p_dst_wave; + // wavewise range (32 bit) + dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(ushort); + // wavewise setting (32 bit) + dst_wave_buffer_resource.config[3] = 0x00027000; index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(ushort); - index_t dst_const_addr_offset = dst_const_data_offset * sizeof(ushort); -#if !CK_WORKAROUND_SWDEV_231101 - __llvm_amdgcn_buffer_store_bf16x2(*p_src, - dst_block_config.data, - 0, - dst_thread_addr_offset + dst_const_addr_offset, - false, - false); + const float* p_src_tmp = reinterpret_cast(p_src_thread); + +#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK + __llvm_amdgcn_buffer_store_f32(*p_src_tmp, + dst_wave_buffer_resource.data, + 0, + dst_thread_data_valid ? dst_thread_addr_offset : 0xffffffff, + false, + false); #else - const float* p_src_tmp = reinterpret_cast(p_src); + uint32_t dst_addr_shift = dst_thread_data_valid ? 
 
 template <>
-__device__ void amd_buffer_store<ushort, 4>(const ushort* p_src,
-                                            ushort* p_dst_block,
+__device__ void amd_buffer_store<ushort, 4>(const ushort* p_src_thread,
+                                            ushort* p_dst_wave,
                                             index_t dst_thread_data_offset,
-                                            index_t dst_const_data_offset)
+                                            bool dst_thread_data_valid,
+                                            index_t dst_data_range)
 {
-    BufferAddressConfig dst_block_config;
+    BufferResourceConstant dst_wave_buffer_resource;
 
-    // fill in byte 0 - 1
-    dst_block_config.address[0] = p_dst_block;
-    // fill in byte 2
-    dst_block_config.range[2] = -1;
-    // fill in byte 3
-    dst_block_config.range[3] = 0x00027000;
+    // wavewise base address (64 bit)
+    dst_wave_buffer_resource.address[0] = p_dst_wave;
+    // wavewise range (32 bit)
+    dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(ushort);
+    // wavewise setting (32 bit)
+    dst_wave_buffer_resource.config[3] = 0x00027000;
 
     index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(ushort);
-    index_t dst_const_addr_offset  = dst_const_data_offset * sizeof(ushort);
 
-#if !CK_WORKAROUND_SWDEV_231101
-    __llvm_amdgcn_buffer_store_bf16x4(*p_src,
-                                      dst_block_config.data,
-                                      0,
-                                      dst_thread_addr_offset + dst_const_addr_offset,
-                                      false,
-                                      false);
+    const float2_t* p_src_tmp = reinterpret_cast<const float2_t*>(p_src_thread);
+
+#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
+    __llvm_amdgcn_buffer_store_f32x2(*p_src_tmp,
+                                     dst_wave_buffer_resource.data,
+                                     0,
+                                     dst_thread_data_valid ? dst_thread_addr_offset : 0xffffffff,
+                                     false,
+                                     false);
 #else
-    const float2_t* p_src_tmp = reinterpret_cast<const float2_t*>(p_src);
+    uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
 
     __llvm_amdgcn_buffer_store_f32x2(*p_src_tmp,
-                                     dst_block_config.data,
+                                     dst_wave_buffer_resource.data,
                                      0,
-                                     dst_thread_addr_offset + dst_const_addr_offset,
+                                     dst_addr_shift + dst_thread_addr_offset,
                                      false,
                                      false);
 #endif
 }
 
 template <>
-__device__ void amd_buffer_atomic_add<float, 1>(const float* p_src,
-                                                float* p_dst_block,
-                                                index_t dst_thread_data_offset,
-                                                index_t dst_const_data_offset)
+__device__ void amd_buffer_store<ushort, 8>(const ushort* p_src_thread,
+                                            ushort* p_dst_wave,
+                                            index_t dst_thread_data_offset,
+                                            bool dst_thread_data_valid,
+                                            index_t dst_data_range)
 {
-    BufferAddressConfig dst_block_config;
+    BufferResourceConstant dst_wave_buffer_resource;
 
-    // fill in byte 0 - 1
-    dst_block_config.address[0] = p_dst_block;
-    // fill in byte 2
-    dst_block_config.range[2] = -1;
-    // fill in byte 3
-    dst_block_config.range[3] = 0x00027000;
+    // wavewise base address (64 bit)
+    dst_wave_buffer_resource.address[0] = p_dst_wave;
+    // wavewise range (32 bit)
+    dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(ushort);
+    // wavewise setting (32 bit)
+    dst_wave_buffer_resource.config[3] = 0x00027000;
+
+    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(ushort);
+
+    const float4_t* p_src_tmp = reinterpret_cast<const float4_t*>(p_src_thread);
+
+#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
+    __llvm_amdgcn_buffer_store_f32x4(*p_src_tmp,
+                                     dst_wave_buffer_resource.data,
+                                     0,
+                                     dst_thread_data_valid ? dst_thread_addr_offset : 0xffffffff,
+                                     false,
+                                     false);
+#else
+    uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
+
+    __llvm_amdgcn_buffer_store_f32x4(*p_src_tmp,
+                                     dst_wave_buffer_resource.data,
+                                     0,
+                                     dst_addr_shift + dst_thread_addr_offset,
+                                     false,
+                                     false);
+#endif
+}
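
Note that the bf16 paths above never convert values: two ushorts are re-issued through one f32 lane (four through f32x2, eight through f32x4), so the store only moves bytes. A small host sketch of that pure bit reinterpretation, using clang's vector extension and made-up values:

    #include <cstring>

    using float2_t = float __attribute__((ext_vector_type(2))); // clang extension

    int main()
    {
        // Four bf16 values held as raw ushort bit patterns (illustrative numbers).
        unsigned short bf16_vals[4] = {0x3f80, 0x4000, 0x4040, 0x4080};

        float2_t payload;
        static_assert(sizeof(payload) == sizeof(bf16_vals), "same byte count");

        // The f32x2 store moves these 8 bytes unchanged; no conversion happens.
        std::memcpy(&payload, bf16_vals, sizeof(payload));
    }
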
+
+template <>
+__device__ void amd_buffer_atomic_add<float, 1>(const float* p_src_thread,
+                                                float* p_dst_wave,
+                                                index_t dst_thread_data_offset,
+                                                bool dst_thread_data_valid,
+                                                index_t dst_data_range)
+{
+    BufferResourceConstant dst_wave_buffer_resource;
+
+    // wavewise base address (64 bit)
+    dst_wave_buffer_resource.address[0] = p_dst_wave;
+    // wavewise range (32 bit)
+    dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(float);
+    // wavewise setting (32 bit)
+    dst_wave_buffer_resource.config[3] = 0x00027000;
 
     index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
-    index_t dst_const_addr_offset  = dst_const_data_offset * sizeof(float);
 
-    __llvm_amdgcn_buffer_atomic_add_f32(
-        *p_src, dst_block_config.data, 0, dst_thread_addr_offset + dst_const_addr_offset, false);
+#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
+    __llvm_amdgcn_buffer_atomic_add_f32(*p_src_thread,
+                                        dst_wave_buffer_resource.data,
+                                        0,
+                                        dst_thread_data_valid ? dst_thread_addr_offset : 0xffffffff,
+                                        false);
+#else
+    uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
+
+    __llvm_amdgcn_buffer_atomic_add_f32(*p_src_thread,
+                                        dst_wave_buffer_resource.data,
+                                        0,
+                                        dst_addr_shift + dst_thread_addr_offset,
+                                        false);
+#endif
 }
 
 template <>
-__device__ void amd_buffer_atomic_add<float, 2>(const float* p_src,
-                                                float* p_dst_block,
+__device__ void amd_buffer_atomic_add<float, 2>(const float* p_src_thread,
+                                                float* p_dst_wave,
                                                 index_t dst_thread_data_offset,
-                                                index_t dst_const_data_offset)
+                                                bool dst_thread_data_valid,
+                                                index_t dst_data_range)
 {
+    BufferResourceConstant dst_wave_buffer_resource;
+
+    // wavewise base address (64 bit)
+    dst_wave_buffer_resource.address[0] = p_dst_wave;
+    // wavewise range (32 bit)
+    dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(float);
+    // wavewise setting (32 bit)
+    dst_wave_buffer_resource.config[3] = 0x00027000;
+
+    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
+
+#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
     for(index_t i = 0; i < 2; ++i)
     {
-        amd_buffer_atomic_add<float, 1>(
-            &p_src[i], p_dst_block, dst_thread_data_offset, dst_const_data_offset + i);
+        __llvm_amdgcn_buffer_atomic_add_f32(
+            p_src_thread[i],
+            dst_wave_buffer_resource.data,
+            0,
+            dst_thread_data_valid ? (dst_thread_addr_offset + i * sizeof(float)) : 0xffffffff,
+            false);
     }
+#else
+    uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
+
+    for(index_t i = 0; i < 2; ++i)
+    {
+        __llvm_amdgcn_buffer_atomic_add_f32(p_src_thread[i],
+                                            dst_wave_buffer_resource.data,
+                                            0,
+                                            dst_addr_shift + dst_thread_addr_offset +
+                                                i * sizeof(float),
+                                            false);
+    }
+#endif
 }
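
The vector atomic adds decompose into one f32 atomic per dword, each at dst_thread_addr_offset + i * sizeof(float), with a single shift applied to every element of an invalid lane. A host-side sketch of the effective byte offsets, with an illustrative thread offset:

    #include <cstdint>
    #include <cstdio>

    int main()
    {
        const std::uint32_t offset = 64; // illustrative per-thread byte offset

        for(int valid = 1; valid >= 0; --valid)
        {
            const std::uint32_t shift = valid ? 0u : 0x7fffffffu;

            // Both elements of an invalid lane land far past any <2 GB range,
            // so all of its atomics are dropped; valid elements hit 64 and 68.
            for(std::uint32_t i = 0; i < 2; ++i)
            {
                std::printf("valid=%d elem=%u voffset=0x%08x\n",
                            valid,
                            i,
                            shift + offset + i * std::uint32_t(sizeof(float)));
            }
        }
    }
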
 
 template <>
-__device__ void amd_buffer_atomic_add<float, 4>(const float* p_src,
-                                                float* p_dst_block,
+__device__ void amd_buffer_atomic_add<float, 4>(const float* p_src_thread,
+                                                float* p_dst_wave,
                                                 index_t dst_thread_data_offset,
-                                                index_t dst_const_data_offset)
+                                                bool dst_thread_data_valid,
+                                                index_t dst_data_range)
 {
+    BufferResourceConstant dst_wave_buffer_resource;
+
+    // wavewise base address (64 bit)
+    dst_wave_buffer_resource.address[0] = p_dst_wave;
+    // wavewise range (32 bit)
+    dst_wave_buffer_resource.range[2] = dst_data_range * sizeof(float);
+    // wavewise setting (32 bit)
+    dst_wave_buffer_resource.config[3] = 0x00027000;
+
+    index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(float);
+
+#if !CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
     for(index_t i = 0; i < 4; ++i)
     {
-        amd_buffer_atomic_add<float, 1>(
-            &p_src[i], p_dst_block, dst_thread_data_offset, dst_const_data_offset + i);
+        __llvm_amdgcn_buffer_atomic_add_f32(
+            p_src_thread[i],
+            dst_wave_buffer_resource.data,
+            0,
+            dst_thread_data_valid ? (dst_thread_addr_offset + i * sizeof(float)) : 0xffffffff,
+            false);
     }
+#else
+    uint32_t dst_addr_shift = dst_thread_data_valid ? 0 : 0x7fffffff;
+
+    for(index_t i = 0; i < 4; ++i)
+    {
+        __llvm_amdgcn_buffer_atomic_add_f32(p_src_thread[i],
+                                            dst_wave_buffer_resource.data,
+                                            0,
+                                            dst_addr_shift + dst_thread_addr_offset +
+                                                i * sizeof(float),
+                                            false);
+    }
+#endif
 }
 
 } // namespace ck
diff --git a/composable_kernel/include/utility/config.amd.hpp.in b/composable_kernel/include/utility/config.amd.hpp.in
index 89a8fd5f60..acb33271a5 100644
--- a/composable_kernel/include/utility/config.amd.hpp.in
+++ b/composable_kernel/include/utility/config.amd.hpp.in
@@ -49,12 +49,13 @@
 #endif
 
 // experimental implementation
+#ifndef CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK
+#define CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK 1
+#endif
+
+#ifndef CK_EXPERIMENTAL_BLOCKWISE_GEMM_USE_PIPELINE
 #define CK_EXPERIMENTAL_BLOCKWISE_GEMM_USE_PIPELINE 1
-#define CK_EXPERIMENTAL_TENSOR_COORDINATE_USE_CALCULATE_OFFSET_DIFF 0
-#define CK_EXPERIMENTAL_THREADWISE_COPY_V4R2_USE_OPTIMIZED_ADDRESS_CACLULATION 0
-#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0
-#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0
-#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0
+#endif
 
 #ifndef CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_OUTPUT_SKIP_OUT_OF_BOUND_CHECK
 #define CK_EXPERIMENTAL_IMPLICIT_GEMM_BACKWARD_DATA_V4R1_OUTPUT_SKIP_OUT_OF_BOUND_CHECK 0
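
Since the experimental switches are now wrapped in #ifndef guards, a build can override them without editing the generated header. A hypothetical override that selects the plain 0xffffffff encoding instead of the offset trick:

    // Hypothetical translation unit that opts out of the offset trick; a
    // definition made before the config header is included wins over the new
    // #ifndef default (a build-system -D flag would achieve the same thing).
    #define CK_EXPERIMENTAL_AMD_BUFFER_ADDRESSING_USE_OFFSET_TRICK 0
    #include "utility/config.hpp" // illustrative include path
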
diff --git a/composable_kernel/include/utility/in_memory_operation.amd.hpp.in b/composable_kernel/include/utility/in_memory_operation.amd.hpp.in
index 4f99531044..4b274401eb 100644
--- a/composable_kernel/include/utility/in_memory_operation.amd.hpp.in
+++ b/composable_kernel/include/utility/in_memory_operation.amd.hpp.in
@@ -47,38 +47,69 @@ struct SetData
 
     // This version is only for compatibility, don't use this version if possible
     template
-    __device__ void Run(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset) const
+    __device__ void Run(const T* p_src,
+                        index_t src_offset,
+                        bool src_valid,
+                        index_t /* src_range */,
+                        T* p_dst,
+                        index_t dst_offset,
+                        bool dst_valid,
+                        index_t /* dst_range */) const
     {
-        *reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
-            *reinterpret_cast<const vector_t*>(&p_src[src_offset]);
+        if(dst_valid)
+        {
+            if(src_valid)
+            {
+                *reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
+                    *reinterpret_cast<const vector_t*>(&p_src[src_offset]);
+            }
+            else
+            {
+                *reinterpret_cast<vector_t*>(&p_dst[dst_offset]) = 0;
+            }
+        }
     }
 
 #if CK_USE_AMD_BUFFER_ADDRESSING
     // buffer_load requires:
-    // 1) p_src must be in global memory space, d_dst must be vgpr
-    // 2) p_src to be a block-invariant pointer.
+    // 1) p_src must be in global memory space, p_dst must be in vgpr
+    // 2) p_src to be a wavewise (wave-invariant) pointer.
     // It is user's responsibility to make sure that is true.
     template <>
     __device__ void Run(const T* p_src,
                         index_t src_offset,
+                        bool src_valid,
+                        index_t src_range,
                         T* p_dst,
-                        index_t dst_offset) const
+                        index_t dst_offset,
+                        bool dst_valid,
+                        index_t /* dst_range */) const
     {
-        *reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
-            amd_buffer_load<T, DataPerAccess>(p_src, src_offset, 0);
+        if(dst_valid)
+        {
+            *reinterpret_cast<vector_t*>(&p_dst[dst_offset]) =
+                amd_buffer_load<T, DataPerAccess>(p_src, src_offset, src_valid, src_range);
+        }
     }
 
     // buffer_store requires:
-    // 1) p_src must be in vgpr space, d_dst must be global memory
-    // 2) p_dst to be a block-invariant pointer.
+    // 1) p_src must be in vgpr space, p_dst must be in global memory
+    // 2) p_dst to be a wavewise (wave-invariant) pointer.
    // It is user's responsibility to make sure that is true.
     template <>
     __device__ void Run(const T* p_src,
                         index_t src_offset,
+                        bool src_valid,
+                        index_t /* src_range */,
                         T* p_dst,
-                        index_t dst_offset) const
+                        index_t dst_offset,
+                        bool dst_valid,
+                        index_t dst_range) const
     {
-        amd_buffer_store<T, DataPerAccess>(&(p_src[src_offset]), p_dst, dst_offset, 0);
+        const auto zeros = vector_t(0);
+
+        amd_buffer_store<T, DataPerAccess>(
+            src_valid ? &(p_src[src_offset]) : reinterpret_cast<const T*>(&zeros),
+            p_dst,
+            dst_offset,
+            dst_valid,
+            dst_range);
     }
 #endif
 };
@@ -90,24 +121,43 @@ struct AtomicAddData
 
     // This version is only for compatibility, don't use this version if possible
     template
-    __device__ void Run(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset) const
+    __device__ void Run(const T* p_src,
+                        index_t src_offset,
+                        bool src_valid,
+                        index_t /* src_range */,
+                        T* p_dst,
+                        index_t dst_offset,
+                        bool dst_valid,
+                        index_t /* dst_range */) const
     {
-        atomic_add_impl(reinterpret_cast<vector_t*>(&p_dst[dst_offset]),
-                        *reinterpret_cast<const vector_t*>(&p_src[src_offset]));
+        if(src_valid && dst_valid)
+        {
+            atomic_add_impl(reinterpret_cast<vector_t*>(&p_dst[dst_offset]),
+                            *reinterpret_cast<const vector_t*>(&p_src[src_offset]));
+        }
     }
 
 #if CK_USE_AMD_BUFFER_ADDRESSING && CK_USE_AMD_BUFFER_ATOMIC_ADD
-    // buffer_atomic_add requires:
-    // 1) p_src must be in vgpr space, d_dst must be global memory
-    // 2) p_dst to be a block-invariant pointer.
+    // buffer_atomic requires:
+    // 1) p_src must be in vgpr space, p_dst must be in global memory
+    // 2) p_dst to be a wavewise (wave-invariant) pointer.
     // It is user's responsibility to make sure that is true.
     template <>
     __device__ void Run(const T* p_src,
                         index_t src_offset,
-                        T* p_dst,
-                        index_t dst_offset) const
+                        bool src_valid,
+                        index_t /* src_range */,
+                        T* p_dst,
+                        index_t dst_offset,
+                        bool dst_valid,
+                        index_t dst_range) const
     {
-        amd_buffer_atomic_add<T, DataPerAccess>(&(p_src[src_offset]), p_dst, dst_offset, 0);
+        const auto zeros = vector_t(0);
+
+        amd_buffer_atomic_add<T, DataPerAccess>(
+            src_valid ? &(p_src[src_offset]) : reinterpret_cast<const T*>(&zeros),
+            p_dst,
+            dst_offset,
+            dst_valid,
+            dst_range);
     }
 #endif
 };
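
Stripped of the buffer intrinsics, the validity rules above reduce to: an invalid destination suppresses the access entirely, and an invalid source behaves like reading zeros, which matches what an out-of-range buffer_load returns (for AtomicAdd, adding zero is a no-op, so skipping the read-modify-write is equivalent). A minimal host-side model, assuming scalar element access:

    // Host-side model (sketch) of the Set / AtomicAdd validity semantics.
    template <typename T>
    void set_data_model(const T* src, bool src_valid, T* dst, bool dst_valid)
    {
        if(!dst_valid)
            return;                     // invalid destination: no access at all
        *dst = src_valid ? *src : T(0); // invalid source: zero-fill, like an OOB load
    }

    template <typename T>
    void atomic_add_model(const T* src, bool src_valid, T* dst, bool dst_valid)
    {
        if(src_valid && dst_valid)
            *dst += *src; // adding zero is a no-op, so invalid lanes skip the RMW
    }
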
@@ -119,7 +169,14 @@ template
-__device__ void transfer_data(const T* p_src, index_t src_offset, T* p_dst, index_t dst_offset)
+__device__ void transfer_data(const T* p_src,
+                              index_t src_offset,
+                              bool src_valid,
+                              index_t src_range,
+                              T* p_dst,
+                              index_t dst_offset,
+                              bool dst_valid,
+                              index_t dst_range)
 {
     static_assert(DstInMemOp == InMemoryDataOperation::Set ||
                       DstInMemOp == InMemoryDataOperation::AtomicAdd,
@@ -131,27 +188,41 @@ __device__ void transfer_data(const T* p_src, index_t src_offset, T* p_dst, inde
         // TODO: use static_if::ElseIf
         static_if<DstInMemOp == InMemoryDataOperation::Set>{}([&](auto) {
             SetData{}.template Run(
-                p_src, src_offset, p_dst, dst_offset);
+                p_src, src_offset, src_valid, src_range, p_dst, dst_offset, dst_valid, dst_range);
         });
 
         static_if<DstInMemOp == InMemoryDataOperation::AtomicAdd>{}([&](auto) {
             AtomicAddData{}.template Run(
-                p_src, src_offset, p_dst, dst_offset);
+                p_src, src_offset, src_valid, src_range, p_dst, dst_offset, dst_valid, dst_range);
         });
     }
     else
    {
-        for(index_t i = 0; i < DataPerAccess; i++)
+        for(index_t i = 0; i < DataPerAccess; ++i)
         {
             // TODO: use static_if::ElseIf
             static_if<DstInMemOp == InMemoryDataOperation::Set>{}([&](auto) {
                 SetData{}.template Run(
-                    p_src, src_offset + i * SrcDataStride, p_dst, dst_offset + i * DstDataStride);
+                    p_src,
+                    src_offset + i * SrcDataStride,
+                    src_valid,
+                    src_range,
+                    p_dst,
+                    dst_offset + i * DstDataStride,
+                    dst_valid,
+                    dst_range);
             });
 
             static_if<DstInMemOp == InMemoryDataOperation::AtomicAdd>{}([&](auto) {
                 AtomicAddData{}.template Run(
-                    p_src, src_offset + i * SrcDataStride, p_dst, dst_offset + i * DstDataStride);
+                    p_src,
+                    src_offset + i * SrcDataStride,
+                    src_valid,
+                    src_range,
+                    p_dst,
+                    dst_offset + i * DstDataStride,
+                    dst_valid,
+                    dst_range);
             });
         }
     }
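
transfer_data itself only forwards the validity flags and ranges and selects the operation. A compact model of that dispatch, using C++17 if constexpr in place of CK's static_if (names and the host-side += are illustrative; real atomicity is supplied by the intrinsics):

    enum class MemOp
    {
        Set,
        AtomicAdd
    };

    // Sketch of transfer_data's per-element dispatch; DataPerAccess > 1 with
    // non-unit strides simply repeats this element-wise, as the loop above does.
    template <MemOp Op, typename T>
    void transfer_model(const T* src, bool src_valid, T* dst, bool dst_valid)
    {
        if constexpr(Op == MemOp::Set)
        {
            if(dst_valid)
            {
                *dst = src_valid ? *src : T(0);
            }
        }
        else // MemOp::AtomicAdd
        {
            if(src_valid && dst_valid)
            {
                *dst += *src; // atomicity elided in this host model
            }
        }
    }
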