diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm.hpp index c1b52e00a2..7582669e08 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm.hpp @@ -194,7 +194,6 @@ struct GridwiseMoeGemm // static constexpr index_t NumTokens = 1; static constexpr index_t SortedTileSize = MPerBlock; - static constexpr auto MakeDsGridPointer() { return generate_tuple( @@ -1471,11 +1470,11 @@ struct GridwiseMoeGemm else if(ActivationOperation == Activation::gelu) { const float scale_up = - p_scale_b[(n0 * NWave * NPerXdl + problem.N) * - PerTokenQuant]; + p_scale_b[(n0 * NWave * NPerXdl + problem.N) * + PerTokenQuant]; auto gate = scale_a * scale_b * c_thread_buf[cidx]; auto up = scale_a * scale_up * c_thread_buf_up[cidx]; - if constexpr (is_same_v, pk_i4_t>) + if constexpr(is_same_v, pk_i4_t>) { gate *= 16; up *= 16; @@ -1490,7 +1489,7 @@ struct GridwiseMoeGemm PerTokenQuant]; auto gate = scale_a * scale_b * c_thread_buf[cidx]; auto up = scale_a * scale_up * c_thread_buf_up[cidx]; - if constexpr (is_same_v, pk_i4_t>) + if constexpr(is_same_v, pk_i4_t>) { gate *= 16; up *= 16; diff --git a/include/ck/utility/dynamic_buffer.hpp b/include/ck/utility/dynamic_buffer.hpp index 04d3a9790a..1d80f196b5 100644 --- a/include/ck/utility/dynamic_buffer.hpp +++ b/include/ck/utility/dynamic_buffer.hpp @@ -25,7 +25,7 @@ template + typename IndexType = index_t> struct DynamicBuffer { using type = T; @@ -380,13 +380,14 @@ struct DynamicBuffer (is_same_v, half_t> && scalar_per_x_vector % 2 == 0) || (is_same_v, bhalf_t> && scalar_per_x_vector % 2 == 0); #elif CK_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER && (!CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT) - bool constexpr use_amd_buffer_addressing = sizeof(IndexType) <= sizeof(int32_t) && is_same_v, int32_t>; + bool constexpr use_amd_buffer_addressing = + sizeof(IndexType) <= sizeof(int32_t) && is_same_v, int32_t>; #elif(!CK_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER) && CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT bool constexpr use_amd_buffer_addressing = - sizeof(IndexType) <= sizeof(int32_t) && ( - is_same_v, float> || - (is_same_v, half_t> && scalar_per_x_vector % 2 == 0) || - (is_same_v, bhalf_t> && scalar_per_x_vector % 2 == 0)); + sizeof(IndexType) <= sizeof(int32_t) && + (is_same_v, float> || + (is_same_v, half_t> && scalar_per_x_vector % 2 == 0) || + (is_same_v, bhalf_t> && scalar_per_x_vector % 2 == 0)); #else bool constexpr use_amd_buffer_addressing = false; #endif @@ -424,8 +425,9 @@ struct DynamicBuffer static_assert(GetAddressSpace() == AddressSpaceEnum::Global, "only support global mem"); #if CK_USE_AMD_BUFFER_ATOMIC_MAX_FLOAT64 - using scalar_t = typename scalar_type>::type; - bool constexpr use_amd_buffer_addressing = sizeof(IndexType) <= sizeof(int32_t) && is_same_v, double>; + using scalar_t = typename scalar_type>::type; + bool constexpr use_amd_buffer_addressing = + sizeof(IndexType) <= sizeof(int32_t) && is_same_v, double>; #else bool constexpr use_amd_buffer_addressing = false; #endif @@ -462,7 +464,8 @@ template -__host__ __device__ constexpr auto make_long_dynamic_buffer(T* p, ElementSpaceSize element_space_size) +__host__ __device__ constexpr auto make_long_dynamic_buffer(T* p, + ElementSpaceSize element_space_size) { return DynamicBuffer{ p, element_space_size};