diff --git a/composable_kernel/include/tensor_operation/blockwise_gemm_dlops_v3.hpp b/composable_kernel/include/tensor_operation/blockwise_gemm_dlops_v3.hpp index a15be541b5..03f889649e 100644 --- a/composable_kernel/include/tensor_operation/blockwise_gemm_dlops_v3.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_gemm_dlops_v3.hpp @@ -133,7 +133,7 @@ struct BlockwiseGemmDlops_km_kn_m0m1n0n1_v3 static_assert(WPerThread % WoPerThreadSubC == 0, ""); // thread A buffer for GEMM - StaticBuffer + StaticBuffer a_thread_buf; constexpr auto threadwise_gemm = ThreadwiseGemmDlops_km_kn_mn_v3 + c_k_n_ho_wo_thread_desc.GetElementSpaceSize(), + true> c_thread_buf; // initialize output thread tensor @@ -251,7 +252,8 @@ struct GridwiseGemmDlops_km_kn_mn_v3 // double regsiter buffer for b StaticBuffer + b_e_n_ho_wo_thread_desc.GetElementSpaceSize(), + true> b_thread_even_buf, b_thread_odd_buf; // LDS double buffer: preload data diff --git a/composable_kernel/include/tensor_operation/gridwise_gemm_xdlops_v2r3.hpp b/composable_kernel/include/tensor_operation/gridwise_gemm_xdlops_v2r3.hpp index 06352edce3..dcb16e5dcd 100644 --- a/composable_kernel/include/tensor_operation/gridwise_gemm_xdlops_v2r3.hpp +++ b/composable_kernel/include/tensor_operation/gridwise_gemm_xdlops_v2r3.hpp @@ -402,7 +402,8 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3 StaticBuffer, - c_mr_nr_blk_desc.GetElementSpaceSize()> + c_mr_nr_blk_desc.GetElementSpaceSize(), + true> c_thread_buf; // LDS allocation for A and B: be careful of alignment @@ -493,7 +494,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3 Number{}, Number<1>{})); - StaticBuffer + StaticBuffer c_blk_buf_; static_for<0, MRepeat, 1>{}([&](auto mr_i) { diff --git a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer.hpp b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer.hpp index db86c1e729..82e46984e2 100644 --- 
a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer.hpp @@ -1242,7 +1242,7 @@ struct ThreadwiseTensorSliceTransfer_v3 static constexpr auto buffer_size_ = buffer_desc_.GetElementSpaceSize(); - StaticBuffer buffer_; + StaticBuffer buffer_; SrcCoord src_coord_; DstCoord dst_coord_; diff --git a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v2.hpp b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v2.hpp index ceac47a364..6d96aa1253 100644 --- a/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v2.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_tensor_slice_transfer_v2.hpp @@ -602,7 +602,7 @@ struct ThreadwiseTensorSliceTransfer_v3r1 static constexpr auto buffer_size_ = buffer_desc_.GetElementSpaceSize(); - StaticBuffer buffer_; + StaticBuffer buffer_; SrcCoord src_coord_; DstCoord dst_coord_; diff --git a/composable_kernel/include/utility/amd_buffer_addressing.hpp b/composable_kernel/include/utility/amd_buffer_addressing.hpp index 711af2e648..57081b7fd7 100644 --- a/composable_kernel/include/utility/amd_buffer_addressing.hpp +++ b/composable_kernel/include/utility/amd_buffer_addressing.hpp @@ -10,25 +10,25 @@ union BufferResource { // 128 bit SGPRs to supply buffer resource in buffer instructions // https://rocm-documentation.readthedocs.io/en/latest/GCN_ISA_Manuals/testdocbook.html#vector-memory-buffer-instructions - int32x4_t data; + int32x4_t content; StaticallyIndexedArray address; StaticallyIndexedArray range; StaticallyIndexedArray config; }; template -__device__ int32x4_t make_wave_buffer_resource(T* p_wave, index_t data_space_size) +__device__ int32x4_t make_wave_buffer_resource(T* p_wave, index_t element_space_size) { BufferResource wave_buffer_resource; // wavewise base address (64 bit) wave_buffer_resource.address(Number<0>{}) = 
const_cast*>(p_wave); // wavewise range (32 bit) - wave_buffer_resource.range(Number<2>{}) = data_space_size * sizeof(T); + wave_buffer_resource.range(Number<2>{}) = element_space_size * sizeof(T); // wavewise setting (32 bit) wave_buffer_resource.config(Number<3>{}) = CK_BUFFER_RESOURCE_3RD_DWORD; - return wave_buffer_resource.data; + return wave_buffer_resource.content; } // load @@ -204,10 +204,9 @@ llvm_amdgcn_raw_buffer_store_fp32x4(float4_t vdata, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f32"); template -__device__ typename vector_type::type -amd_buffer_load_impl_v2(int32x4_t src_wave_buffer_resource, - index_t src_thread_addr_offset, - index_t src_wave_addr_offset) +__device__ typename vector_type::type amd_buffer_load_impl(int32x4_t src_wave_buffer_resource, + index_t src_thread_addr_offset, + index_t src_wave_addr_offset) { static_assert( (is_same::value && (N == 1 || N == 2 || N == 4 || N == 8)) || @@ -412,10 +411,10 @@ amd_buffer_load_impl_v2(int32x4_t src_wave_buffer_resource, } template -__device__ void amd_buffer_store_impl_v2(const typename vector_type::type src_thread_data, - int32x4_t dst_wave_buffer_resource, - index_t dst_thread_addr_offset, - index_t dst_wave_addr_offset) +__device__ void amd_buffer_store_impl(const typename vector_type::type src_thread_data, + int32x4_t dst_wave_buffer_resource, + index_t dst_thread_addr_offset, + index_t dst_wave_addr_offset) { static_assert( (is_same::value && (N == 1 || N == 2 || N == 4)) || @@ -584,67 +583,95 @@ __device__ void amd_buffer_store_impl_v2(const typename vector_type::type // buffer_load requires: // 1) p_src_wave must be in global memory space -// 2) p_src_wave to be a wavewise pointer. +// 2) p_src_wave must be a wavewise pointer. // It is user's responsibility to make sure that is true. 
template __device__ typename vector_type_maker::type::type -amd_buffer_load_v2(const T* p_src_wave, - index_t src_thread_data_offset, - bool src_thread_data_valid, - index_t src_element_space) +amd_buffer_load_invalid_element_return_zero(const T* p_src_wave, + index_t src_thread_element_offset, + bool src_thread_element_valid, + index_t src_element_space_size) { const int32x4_t src_wave_buffer_resource = - make_wave_buffer_resource(p_src_wave, src_element_space); + make_wave_buffer_resource(p_src_wave, src_element_space_size); - index_t src_thread_addr_offset = src_thread_data_offset * sizeof(T); + index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T); + + using vector_t = typename vector_type_maker::type::type; + using scalar_t = typename scalar_type::type; - using vector_t = typename vector_type_maker::type::type; - using scalar_t = typename scalar_type::type; constexpr index_t vector_size = scalar_type::vector_size; #if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK - uint32_t src_addr_shift = src_thread_data_valid ? 0 : 0x7fffffff; + uint32_t src_addr_shift = src_thread_element_valid ? 0 : 0x7fffffff; - return amd_buffer_load_impl_v2( + return amd_buffer_load_impl( src_wave_buffer_resource, src_addr_shift + src_thread_addr_offset, 0); #else - vector_t tmp = amd_buffer_load_impl_v2( + vector_t tmp = amd_buffer_load_impl( src_wave_buffer_resource, src_thread_addr_offset, 0); - return src_thread_data_valid ? tmp : vector_t(0); + return src_thread_element_valid ? tmp : vector_t(0); #endif } +// buffer_load requires: +// 1) p_src_wave must be in global memory space +// 2) p_src_wave must be a wavewise pointer. +// It is user's responsibility to make sure that is true. 
+template +__device__ typename vector_type_maker::type::type +amd_buffer_load_invalid_element_return_customized_value(const T* p_src_wave, + index_t src_thread_element_offset, + bool src_thread_element_valid, + index_t src_element_space_size, + T customized_value) +{ + const int32x4_t src_wave_buffer_resource = + make_wave_buffer_resource(p_src_wave, src_element_space_size); + + index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T); + + using vector_t = typename vector_type_maker::type::type; + using scalar_t = typename scalar_type::type; + + constexpr index_t vector_size = scalar_type::vector_size; + + vector_t tmp = amd_buffer_load_impl( + src_wave_buffer_resource, src_thread_addr_offset, 0); + + return src_thread_element_valid ? tmp : vector_t(customized_value); +} + // buffer_store requires: // 1) p_dst_wave must be global memory // 2) p_dst_wave to be a wavewise pointer. // It is user's responsibility to make sure that is true. template -__device__ void -amd_buffer_store_v2(const typename vector_type_maker::type::type src_thread_data, - T* p_dst_wave, - const index_t dst_thread_data_offset, - const bool dst_thread_data_valid, - const index_t dst_element_space) +__device__ void amd_buffer_store(const typename vector_type_maker::type::type src_thread_data, + T* p_dst_wave, + const index_t dst_thread_element_offset, + const bool dst_thread_element_valid, + const index_t dst_element_space_size) { const int32x4_t dst_wave_buffer_resource = - make_wave_buffer_resource(p_dst_wave, dst_element_space); + make_wave_buffer_resource(p_dst_wave, dst_element_space_size); - index_t dst_thread_addr_offset = dst_thread_data_offset * sizeof(T); + index_t dst_thread_addr_offset = dst_thread_element_offset * sizeof(T); using vector_t = typename vector_type_maker::type::type; using scalar_t = typename scalar_type::type; constexpr index_t vector_size = scalar_type::vector_size; #if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK - uint32_t dst_addr_shift = 
dst_thread_data_valid ? 0 : 0x7fffffff; + uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x7fffffff; - amd_buffer_store_impl_v2( + amd_buffer_store_impl( src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0); #else - if(dst_thread_data_valid) + if(dst_thread_element_valid) { - amd_buffer_store_impl_v2( + amd_buffer_store_impl( src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0); } #endif diff --git a/composable_kernel/include/utility/dynamic_buffer.hpp b/composable_kernel/include/utility/dynamic_buffer.hpp index 920a716765..6f54f7317c 100644 --- a/composable_kernel/include/utility/dynamic_buffer.hpp +++ b/composable_kernel/include/utility/dynamic_buffer.hpp @@ -6,34 +6,43 @@ namespace ck { -template +template struct DynamicBuffer { using type = T; T* p_data_; ElementSpaceSize element_space_size_; + T invalid_element_value_ = T{0}; __host__ __device__ constexpr DynamicBuffer(T* p_data, ElementSpaceSize element_space_size) : p_data_{p_data}, element_space_size_{element_space_size} { } + __host__ __device__ constexpr DynamicBuffer(T* p_data, + ElementSpaceSize element_space_size, + T invalid_element_value) + : p_data_{p_data}, + element_space_size_{element_space_size}, + invalid_element_value_{invalid_element_value} + { + } + __host__ __device__ static constexpr AddressSpaceEnum_t GetAddressSpace() { return BufferAddressSpace; } - __host__ __device__ constexpr const T& operator[](index_t i) const { return p_data_[i]; } - - __host__ __device__ constexpr T& operator()(index_t i) { return p_data_[i]; } - template >>::type, typename scalar_type>>::type>::value, bool>::type = false> - __host__ __device__ constexpr auto Get(index_t i, bool is_valid_offset) const + __host__ __device__ constexpr auto Get(index_t i, bool is_valid_element) const { // X contains multiple T constexpr index_t scalar_per_t_vector = @@ -45,20 +54,41 @@ struct DynamicBuffer static_assert(scalar_per_x_vector % scalar_per_t_vector == 0, 
"wrong! X need to be multiple T"); - if constexpr(GetAddressSpace() == AddressSpaceEnum_t::Global) - { #if CK_USE_AMD_BUFFER_ADDRESSING + bool constexpr use_amd_buffer_addressing = true; +#else + bool constexpr use_amd_buffer_addressing = false; +#endif + + if constexpr(GetAddressSpace() == AddressSpaceEnum_t::Global && use_amd_buffer_addressing) + { constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector; - return amd_buffer_load_v2>, t_per_x>( - p_data_, i, is_valid_offset, element_space_size_); -#else - return is_valid_offset ? *c_style_pointer_cast(&p_data_[i]) : X{0}; -#endif + if constexpr(InvalidElementUseNumericalZeroValue) + { + return amd_buffer_load_invalid_element_return_zero< + remove_cv_t>, + t_per_x>(p_data_, i, is_valid_element, element_space_size_); + } + else + { + return amd_buffer_load_invalid_element_return_customized_value< + remove_cv_t>, + t_per_x>( + p_data_, i, is_valid_element, element_space_size_, invalid_element_value_); + } } else { - return is_valid_offset ? *c_style_pointer_cast(&p_data_[i]) : X{0}; + if constexpr(InvalidElementUseNumericalZeroValue) + { + return is_valid_element ? *c_style_pointer_cast(&p_data_[i]) : X{0}; + } + else + { + return is_valid_element ? 
*c_style_pointer_cast(&p_data_[i]) + : X{invalid_element_value_}; + } } } @@ -67,7 +97,7 @@ struct DynamicBuffer is_same>>::type, typename scalar_type>>::type>::value, bool>::type = false> - __host__ __device__ void Set(index_t i, bool is_valid_offset, const X& x) + __host__ __device__ void Set(index_t i, bool is_valid_element, const X& x) { // X contains multiple T constexpr index_t scalar_per_t_vector = @@ -84,10 +114,10 @@ struct DynamicBuffer #if CK_USE_AMD_BUFFER_ADDRESSING constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector; - amd_buffer_store_v2>, t_per_x>( - x, p_data_, i, is_valid_offset, element_space_size_); + amd_buffer_store>, t_per_x>( + x, p_data_, i, is_valid_element, element_space_size_); #else - if(is_valid_offset) + if(is_valid_element) { *c_style_pointer_cast(&p_data_[i]) = x; } @@ -95,7 +125,7 @@ struct DynamicBuffer } else if constexpr(GetAddressSpace() == AddressSpaceEnum_t::Lds) { - if(is_valid_offset) + if(is_valid_element) { #if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_DS_WRITE_ISSUE *c_style_pointer_cast(&p_data_[i]) = x; @@ -185,7 +215,7 @@ struct DynamicBuffer } else { - if(is_valid_offset) + if(is_valid_element) { *c_style_pointer_cast(&p_data_[i]) = x; } @@ -197,12 +227,18 @@ struct DynamicBuffer __host__ __device__ static constexpr bool IsDynamicBuffer() { return true; } }; -template +template __host__ __device__ constexpr auto make_dynamic_buffer(T* p, ElementSpaceSize element_space_size) { - return DynamicBuffer{p, element_space_size}; + return DynamicBuffer{p, element_space_size}; +} + +template +__host__ __device__ constexpr auto +make_dynamic_buffer(T* p, ElementSpaceSize element_space_size, T invalid_element_value) +{ + return DynamicBuffer{ + p, element_space_size, invalid_element_value}; } } // namespace ck diff --git a/composable_kernel/include/utility/static_buffer.hpp b/composable_kernel/include/utility/static_buffer.hpp index a23cf4f80d..cd67b8a0be 100644 --- a/composable_kernel/include/utility/static_buffer.hpp 
+++ b/composable_kernel/include/utility/static_buffer.hpp @@ -5,30 +5,66 @@ namespace ck { -template +template struct StaticBuffer : public StaticallyIndexedArray { using type = T; using base = StaticallyIndexedArray; + T invalid_element_value_ = T{0}; + __host__ __device__ constexpr StaticBuffer() : base{} {} + __host__ __device__ constexpr StaticBuffer(T invalid_element_value) + : base{}, invalid_element_value_{invalid_element_value} + { + } + __host__ __device__ static constexpr AddressSpaceEnum_t GetAddressSpace() { return BufferAddressSpace; } + template + __host__ __device__ constexpr auto Get(Number i, bool is_valid_element) const + { + if constexpr(InvalidElementUseNumericalZeroValue) + { + return is_valid_element ? At(i) : T{0}; + } + else + { + return is_valid_element ? At(i) : invalid_element_value_; + } + } + + template + __host__ __device__ void Set(Number i, bool is_valid_element, const T& x) + { + if(is_valid_element) + { + At(i) = x; + } + } + __host__ __device__ static constexpr bool IsStaticBuffer() { return true; } __host__ __device__ static constexpr bool IsDynamicBuffer() { return false; } }; -template +template __host__ __device__ constexpr auto make_static_buffer(Number) { - return StaticBuffer{}; + return StaticBuffer{}; +} + +template +__host__ __device__ constexpr auto make_static_buffer(Number, T invalid_element_value) +{ + return StaticBuffer{invalid_element_value}; } } // namespace ck diff --git a/host/driver_offline/src/conv_fwd_driver_offline.cpp b/host/driver_offline/src/conv_fwd_driver_offline.cpp index 161d17a4de..32c33003c5 100644 --- a/host/driver_offline/src/conv_fwd_driver_offline.cpp +++ b/host/driver_offline/src/conv_fwd_driver_offline.cpp @@ -21,7 +21,7 @@ #define USE_MODE 1 #define USE_CONV_FWD_V4R4_NCHW 1 -#define USE_CONV_FWD_V4R4R2_NHWC 0 +#define USE_CONV_FWD_V4R4R2_NHWC 1 #define USE_CONV_FWD_V6R1_NCHW 0 #define USE_CONV_FWD_V5R1_NCHW 0 #define USE_CONV_FWD_V4R4R2_XDL_NCHW 0