mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-20 21:09:08 +00:00
Merge commit 'cdfd7722bfda0181e9ccb75db4161fb95fdef353' into develop
This commit is contained in:
6
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp
Executable file → Normal file
6
include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp
Executable file → Normal file
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
@@ -1841,7 +1841,7 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
|
||||
CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
|
||||
Sequence<0, 1, 2, 3>, // typename ThreadClusterArrangeOrder,
|
||||
CShuffleDataType, // typename SrcData,
|
||||
CShuffleDataType, // typename DstData,
|
||||
AccDataType, // typename DstData,
|
||||
decltype(c_shuffle_block_desc_mblock_mperblock_nblock_nperblock),
|
||||
decltype(c_block_desc_mshuffle_mpershuffle_nshuffle_npershuffle),
|
||||
Sequence<0, 1, 2, 3>, // typename DimAccessOrder,
|
||||
@@ -2591,7 +2591,7 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
|
||||
CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
|
||||
Sequence<0, 1, 2, 3>, // typename ThreadClusterArrangeOrder,
|
||||
CShuffleDataType, // typename SrcData,
|
||||
CShuffleDataType, // typename DstData,
|
||||
AccDataType, // typename DstData,
|
||||
decltype(c_shuffle_block_desc_mblock_mperblock_nblock_nperblock),
|
||||
decltype(c_block_desc_mshuffle_mpershuffle_nshuffle_npershuffle),
|
||||
Sequence<0, 1, 2, 3>, // typename DimAccessOrder,
|
||||
|
||||
52
include/ck/utility/dynamic_buffer.hpp
Executable file → Normal file
52
include/ck/utility/dynamic_buffer.hpp
Executable file → Normal file
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
@@ -139,8 +139,7 @@ struct DynamicBuffer
|
||||
template <InMemoryDataOperationEnum Op,
|
||||
typename X,
|
||||
typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
|
||||
typename scalar_type<remove_cvref_t<T>>::type>::value ||
|
||||
!is_native_type<X>(),
|
||||
typename scalar_type<remove_cvref_t<T>>::type>::value,
|
||||
bool>::type = false>
|
||||
__host__ __device__ void Update(IndexType i, bool is_valid_element, const X& x)
|
||||
{
|
||||
@@ -160,37 +159,7 @@ struct DynamicBuffer
|
||||
{
|
||||
auto tmp = this->template Get<X>(i, is_valid_element);
|
||||
using scalar_t = typename scalar_type<remove_cvref_t<T>>::type;
|
||||
|
||||
#if defined(__gfx942__) || defined(__gfx950__)
|
||||
|
||||
// Properly handle addition for all low-precision types
|
||||
if constexpr(is_same_v<scalar_t, bhalf_t> || is_same_v<scalar_t, half_t>)
|
||||
{
|
||||
if constexpr(is_scalar_type<X>::value)
|
||||
{
|
||||
// Scalar type: Convert to float, add, convert back
|
||||
auto result =
|
||||
type_convert<X>(type_convert<float>(x) + type_convert<float>(tmp));
|
||||
this->template Set<X>(i, is_valid_element, result);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Vector type
|
||||
constexpr auto vector_size = scalar_type<remove_cvref_t<X>>::vector_size;
|
||||
const vector_type<scalar_t, vector_size> a_vector{tmp};
|
||||
const vector_type<scalar_t, vector_size> b_vector{x};
|
||||
|
||||
// Process each element of the vector in higher precision
|
||||
static_for<0, vector_size, 1>{}([&](auto idx) {
|
||||
auto result = type_convert<scalar_t>(
|
||||
type_convert<float>(a_vector.template AsType<scalar_t>()[idx]) +
|
||||
type_convert<float>(b_vector.template AsType<scalar_t>()[idx]));
|
||||
this->template Set<scalar_t>(i + idx, is_valid_element, result);
|
||||
});
|
||||
}
|
||||
}
|
||||
#else
|
||||
// handle bfloat addition
|
||||
// handle bfloat addition
|
||||
if constexpr(is_same_v<scalar_t, bhalf_t>)
|
||||
{
|
||||
if constexpr(is_scalar_type<X>::value)
|
||||
@@ -218,8 +187,6 @@ struct DynamicBuffer
|
||||
{
|
||||
this->template Set<X>(i, is_valid_element, x + tmp);
|
||||
}
|
||||
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
@@ -273,20 +240,9 @@ struct DynamicBuffer
|
||||
if constexpr(GetAddressSpace() == AddressSpaceEnum::Global && use_amd_buffer_addressing)
|
||||
{
|
||||
constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
|
||||
using vector_t = typename vector_type_maker<remove_cvref_t<T>, t_per_x>::type::type;
|
||||
vector_t tmp;
|
||||
|
||||
if constexpr(is_same_v<remove_cvref_t<X>, vector_t>)
|
||||
{
|
||||
tmp = x;
|
||||
}
|
||||
else
|
||||
{
|
||||
__builtin_memcpy(&tmp, &x, sizeof(vector_t));
|
||||
}
|
||||
|
||||
amd_buffer_store<remove_cvref_t<T>, t_per_x, coherence>(
|
||||
tmp, p_data_, i, is_valid_element, element_space_size_ / PackedSize);
|
||||
x, p_data_, i, is_valid_element, element_space_size_ / PackedSize);
|
||||
}
|
||||
else if constexpr(GetAddressSpace() == AddressSpaceEnum::Lds &&
|
||||
is_same<typename scalar_type<remove_cvref_t<T>>::type, int8_t>::value &&
|
||||
|
||||
@@ -1,66 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <tuple>
|
||||
#include <type_traits>
|
||||
#include <utility>
|
||||
|
||||
#include "ck/utility/functional.hpp"
|
||||
#include "ck/utility/sequence.hpp"
|
||||
|
||||
namespace ck::util {
|
||||
|
||||
template <typename Tuple, std::size_t Stride, std::size_t Offset>
|
||||
struct filter_tuple_by_modulo
|
||||
{
|
||||
// Validate Stride and Offset.
|
||||
static_assert(Stride > 0, "Offset must be positive.");
|
||||
static_assert(Offset >= 0 && Offset < Stride,
|
||||
"Offset must be positive and less than the stride.");
|
||||
|
||||
// Generate filtered indices for this stride and offset.
|
||||
static constexpr int new_size = (std::tuple_size_v<Tuple> + Stride - Offset - 1) / Stride;
|
||||
|
||||
template <std::size_t... Is>
|
||||
static constexpr auto to_index(std::index_sequence<Is...>)
|
||||
{
|
||||
return std::index_sequence<(Offset + Is * Stride)...>{};
|
||||
}
|
||||
|
||||
using filtered_indices = decltype(to_index(std::make_index_sequence<new_size>{}));
|
||||
|
||||
// Helper struct to construct the new tuple type from the filtered indices.
|
||||
template <typename T, typename Indices>
|
||||
struct make_filtered_tuple_type_impl;
|
||||
|
||||
template <typename T, std::size_t... Is>
|
||||
struct make_filtered_tuple_type_impl<T, std::index_sequence<Is...>>
|
||||
{
|
||||
using type = std::tuple<std::tuple_element_t<Is, T>...>;
|
||||
};
|
||||
|
||||
using type = typename make_filtered_tuple_type_impl<Tuple, filtered_indices>::type;
|
||||
};
|
||||
|
||||
// Filter a tuple with a stride and offset.
|
||||
//
|
||||
// Tuple is a std::tuple or equivalent
|
||||
// Stride is a positive integer
|
||||
// Offset is a positive integer smaller than ofset
|
||||
//
|
||||
// Evaluates to a smaller tuple type from elements of T with stride M and offset I.
|
||||
//
|
||||
// Can be used to filter a tuple of types for sharded instantiations.
|
||||
template <typename Tuple, std::size_t Stride, std::size_t Offset>
|
||||
using filter_tuple_by_modulo_t = typename filter_tuple_by_modulo<Tuple, Stride, Offset>::type;
|
||||
|
||||
// Example compile-time test:
|
||||
// using OriginalTuple =
|
||||
// std::tuple<int, double, char, float, long, short, bool, char, long long, unsigned int>;
|
||||
// using NewTuple_Every3rdFrom2nd = filter_tuple_by_modulo_t<OriginalTuple, 3, 1>;
|
||||
// static_assert(std::is_same_v<NewTuple_Every3rdFrom2nd, std::tuple<double, long, char>>,
|
||||
// "Test Case 1 Failed: Every 3rd from 2nd");
|
||||
|
||||
} // namespace ck::util
|
||||
Reference in New Issue
Block a user