diff --git a/include/ck/utility/data_type.hpp b/include/ck/utility/data_type.hpp index f5159ab4aa..d9c954c50f 100644 --- a/include/ck/utility/data_type.hpp +++ b/include/ck/utility/data_type.hpp @@ -19,8 +19,6 @@ struct pk_i4_t type data; __host__ __device__ constexpr pk_i4_t() : data{type{}} {} __host__ __device__ constexpr pk_i4_t(type init) : data{init} {} - - __host__ __device__ constexpr operator float() const { return static_cast(data); } }; inline constexpr auto next_pow2(uint32_t x) diff --git a/include/ck/utility/dynamic_buffer.hpp b/include/ck/utility/dynamic_buffer.hpp index 639aa1efe4..6de17a6152 100644 --- a/include/ck/utility/dynamic_buffer.hpp +++ b/include/ck/utility/dynamic_buffer.hpp @@ -29,6 +29,13 @@ struct DynamicBuffer ElementSpaceSize element_space_size_; T invalid_element_value_ = T{0}; + static constexpr index_t PackedSize = []() { + if constexpr(is_same_v, pk_i4_t>) + return 2; + else + return 1; + }(); + __host__ __device__ constexpr DynamicBuffer(T* p_data, ElementSpaceSize element_space_size) : p_data_{p_data}, element_space_size_{element_space_size} { @@ -82,14 +89,18 @@ struct DynamicBuffer return amd_buffer_load_invalid_element_return_zero, t_per_x, coherence>( - p_data_, i, is_valid_element, element_space_size_); + p_data_, i, is_valid_element, element_space_size_ / PackedSize); } else { return amd_buffer_load_invalid_element_return_customized_value, t_per_x, coherence>( - p_data_, i, is_valid_element, element_space_size_, invalid_element_value_); + p_data_, + i, + is_valid_element, + element_space_size_ / PackedSize, + invalid_element_value_); } } else @@ -191,7 +202,7 @@ struct DynamicBuffer dst_buf.p_data_, dst_offset, is_valid_element, - element_space_size_); + element_space_size_ / PackedSize); } template , t_per_x, coherence>( - x, p_data_, i, is_valid_element, element_space_size_); + x, p_data_, i, is_valid_element, element_space_size_ / PackedSize); } else if constexpr(GetAddressSpace() == AddressSpaceEnum::Lds && is_same>::type, int8_t>::value && @@ -378,7 +389,7 @@ struct DynamicBuffer constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector; amd_buffer_atomic_add, t_per_x>( - x, p_data_, i, is_valid_element, element_space_size_); + x, p_data_, i, is_valid_element, element_space_size_ / PackedSize); } else { @@ -417,7 +428,7 @@ struct DynamicBuffer constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector; amd_buffer_atomic_max, t_per_x>( - x, p_data_, i, is_valid_element, element_space_size_); + x, p_data_, i, is_valid_element, element_space_size_ / PackedSize); } else if(is_valid_element) { diff --git a/profiler/include/profiler/profile_gemm_b_scale_impl.hpp b/profiler/include/profiler/profile_gemm_b_scale_impl.hpp index d01d48892c..fe977e766e 100644 --- a/profiler/include/profiler/profile_gemm_b_scale_impl.hpp +++ b/profiler/include/profiler/profile_gemm_b_scale_impl.hpp @@ -342,7 +342,7 @@ bool profile_gemm_b_scale_impl(int do_verification, if(do_log) { LogRangeAsType(std::cout << "a : ", a_m_k.mData, ",") << std::endl; - LogRangeAsType(std::cout << "b: ", b_k_n.mData, ",") << std::endl; + LogRangeAsType(std::cout << "b: ", b_k_n.mData, ",") << std::endl; LogRangeAsType( std::cout << "c_host : ", c_m_n_host_result.mData, ",") << std::endl;