Files
composable_kernel/include/ck/utility/container_helper.hpp
arai713 2e3183af4f Codegen hipRTC compilation (#1579)
* updating codegen build for MIOpen access: adding .cmake for codegen component

* updating CMake

* adding in header guards for some headers due to issues with hiprtc compilation in MIOpen

* some more header guards

* putting env file in header guard

* cleaning up some includes

* updated types file for hiprtc purposes

* fixed types file: bit-wise/memcpy issue

* updating multiple utility files to deal with standard header inclusion for hiprtc

* added some more header guards in the utility files, replacing some standard header functionality

* added some more header guards

* fixing some conflicts in utility files, another round of header guards

* fixing errors in data type file

* resolved conflict errors in a few utility files

* added header guards/replicated functionality in device files

* resolved issues with standard headers in device files: device_base and device_grouped_conv_fwd_multiple_abd

* resolved issues with standard headers in device files: device_base.hpp, device_grouped_conv_fwd_multiple_abd.hpp, device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp

* added header guards for gridwise gemm files: gridwise_gemm_multiple_abd_xdl_cshuffle.hpp and gridwise_gemm_multiple_d_xdl_cshuffle.hpp

* fixed issue with numerics header, removed from transform_conv_fwd_to_gemm and added to device_column_to_image_impl, device_grouped_conv_fwd_multiple_abd_xdl_cshuffle, device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3, device_image_to_column_impl

* replaced standard header usage and added header guards in block to ctile map and gridwise_gemm_pipeline_selector

* resolved errors in device_gemm_xdl_splitk_c_shuffle files in regards to replacement of standard headers in previous commit

* added replicated functionality for standard header methods in utility files

* replaced standard header functionality in threadwise tensor slice transfer files and added header guards in element_wise_operation.hpp

* temp fix for namespace error in MIOpen

* remove standard header usage in codegen device op

* removed standard header usage in elementwise files, resolved namespace errors

* formatting fix

* changed codegen argument to ON for testing

* temporarily removing codegen compiler flag for testing purposes

* added codegen flag again, set default to ON

* set codegen flag default back to OFF

* replaced enable_if_t standard header usage in data_type.hpp

* added some debug prints to pinpoint issues in MIOpen

* added print outs to debug in MIOpen

* removed debug print outs from device op

* resolved stdexcept include error

* formatting fix

* adding includes to new fp8 file to resolve ck::enable_if_t errors

* made changes to amd_wave_read_first_lane

* updated functionality in type utility file

* fixed end of file issue

* resovled errors in type utility file, added functionality to array utility file

* fixed standard header usage replication in data_type file, resolves error with failing examples on navi3x

* formatting fix

* replaced standard header usage in amd_ck_fp8 file

* added include to random_gen file

* removed and replicated standard header usage from data_type and type_convert files for fp8 changes

* replicated standard unsigned integer types in random_gen

* resolved comments from review: put calls to reinterpret_cast for size_t in header guards

* updated/added copyright headers

* removed duplicate header

* fixed typo in header guard

* updated copyright headers

---------

Co-authored-by: Illia Silin <98187287+illsilin@users.noreply.github.com>
2025-01-31 09:48:39 -08:00

394 lines
12 KiB
C++

// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
#ifndef CK_CONTAINER_HELPER_HPP
#define CK_CONTAINER_HELPER_HPP
#include "sequence.hpp"
#include "sequence_helper.hpp"
#include "array.hpp"
#include "tuple.hpp"
#include "tuple_helper.hpp"
#include "statically_indexed_array.hpp"
#include "container_element_picker.hpp"
namespace ck {
template <typename TData, index_t NSize>
__host__ __device__ constexpr auto container_push_back(const Array<TData, NSize>& a, const TData& x)
{
Array<TData, NSize + 1> r;
static_for<0, NSize, 1>{}([&r, &a ](auto i) constexpr { r(i) = a[i]; });
r(Number<NSize>{}) = x;
return r;
}
template <typename... Ts, typename T>
__host__ __device__ constexpr auto container_push_front(const Tuple<Ts...>& a, const T& x)
{
return container_concat(make_tuple(x), a);
}
template <typename... Ts, typename T>
__host__ __device__ constexpr auto container_push_back(const Tuple<Ts...>& a, const T& x)
{
return container_concat(a, make_tuple(x));
}
template <typename TData, index_t NSize, index_t... IRs>
__host__ __device__ constexpr auto
container_reorder_given_new2old(const Array<TData, NSize>& old_array, Sequence<IRs...> /*new2old*/)
{
static_assert(NSize == sizeof...(IRs), "wrong! size not consistent");
static_assert(is_valid_sequence_map<Sequence<IRs...>>{}, "wrong! invalid reorder map");
return make_array(old_array[Number<IRs>{}]...);
}
template <typename TData, index_t NSize, index_t... IRs>
__host__ __device__ constexpr auto
container_reorder_given_old2new(const Array<TData, NSize>& old_array, Sequence<IRs...> old2new)
{
return container_reorder_given_new2old(
old_array, typename sequence_map_inverse<decltype(old2new)>::type{});
}
template <typename... Ts, index_t... IRs>
__host__ __device__ constexpr auto container_reorder_given_new2old(const Tuple<Ts...>& old_tuple,
Sequence<IRs...> /*new2old*/)
{
static_assert(sizeof...(Ts) == sizeof...(IRs), "wrong! size not consistent");
static_assert(is_valid_sequence_map<Sequence<IRs...>>{}, "wrong! invalid reorder map");
return make_tuple(old_tuple[Number<IRs>{}]...);
}
template <typename... Ts, index_t... IRs>
__host__ __device__ constexpr auto container_reorder_given_old2new(const Tuple<Ts...>& old_tuple,
Sequence<IRs...> old2new)
{
return container_reorder_given_new2old(
old_tuple, typename sequence_map_inverse<decltype(old2new)>::type{});
}
template <index_t... Is, index_t... IRs>
__host__ __device__ constexpr auto container_reorder_given_new2old(Sequence<Is...> /* old_seq */,
Sequence<IRs...> /*new2old*/)
{
static_assert(sizeof...(Is) == sizeof...(IRs), "wrong! size not consistent");
static_assert(is_valid_sequence_map<Sequence<IRs...>>{}, "wrong! invalid reorder map");
return Sequence<Sequence<Is...>::At(Number<IRs>{})...>{};
}
template <index_t... Is, index_t... IRs>
__host__ __device__ constexpr auto container_reorder_given_old2new(Sequence<Is...> old_seq,
Sequence<IRs...> /* old2new */)
{
static_assert(sizeof...(Is) == sizeof...(IRs), "wrong! size not consistent");
static_assert(is_valid_sequence_map<Sequence<IRs...>>{}, "wrong! invalid reorder map");
constexpr auto new2old = typename sequence_map_inverse<Sequence<IRs...>>::type{};
return container_reorder_given_new2old(old_seq, new2old);
}
#if !CK_WORKAROUND_SWDEV_275126
// rocm-4.1 compiler would crash for recursive lambda
template <typename Container,
typename Reduce,
typename Init,
index_t IBegin = 0,
index_t IEnd = Container::Size(),
index_t IStep = 1>
__host__ __device__ constexpr auto container_reduce(const Container& x,
Reduce reduce,
Init init,
Number<IBegin> = Number<0>{},
Number<IEnd> = Number<Container::Size()>{},
Number<IStep> = Number<1>{})
{
static_assert((IEnd - IBegin) % IStep == 0, "wrong!");
// f is recursive function, fs is a dummy of f
// i is index, y_old is current scan, r_old is current reduction
auto f = [&](auto fs, auto i, auto r_old) {
auto r_new = reduce(x[i], r_old);
if constexpr(i.value < IEnd - IStep)
{
// recursively call f/fs
return fs(fs, i + Number<IStep>{}, r_new);
}
else
{
return r_new;
}
};
// start recursion
return f(f, Number<IBegin>{}, init);
}
#else
// i is index, y_old is current scan, r_old is current reduction
template <typename Container,
typename Reduce,
typename ROld,
index_t I,
index_t IEnd,
index_t IStep>
__host__ __device__ constexpr auto container_reduce_impl(
const Container& x, Reduce reduce, ROld r_old, Number<I> i, Number<IEnd>, Number<IStep>)
{
auto r_new = reduce(x[i], r_old);
if constexpr(i.value < IEnd - IStep)
{
return container_reduce_impl(
x, reduce, r_new, i + Number<IStep>{}, Number<IEnd>{}, Number<IStep>{});
}
else
{
return r_new;
}
}
// rocm-4.1 compiler would crash for recursive lambda
// container reduce with initial value
template <typename Container,
typename Reduce,
typename Init,
index_t IBegin = 0,
index_t IEnd = Container::Size(),
index_t IStep = 1>
__host__ __device__ constexpr auto container_reduce(const Container& x,
Reduce reduce,
Init init,
Number<IBegin> = Number<0>{},
Number<IEnd> = Number<Container::Size()>{},
Number<IStep> = Number<1>{})
{
static_assert((IEnd - IBegin) % IStep == 0, "wrong!");
if constexpr(IEnd > IBegin)
{
return container_reduce_impl(
x, reduce, init, Number<IBegin>{}, Number<IEnd>{}, Number<IStep>{});
}
else
{
return init;
}
}
#endif
template <typename TData, index_t NSize, typename Reduce>
__host__ __device__ constexpr auto
container_reverse_inclusive_scan(const Array<TData, NSize>& x, Reduce f, TData init)
{
Array<TData, NSize> y;
TData r = init;
static_for<NSize - 1, 0, -1>{}([&](auto i) {
r = f(r, x[i]);
y(i) = r;
});
r = f(r, x[Number<0>{}]);
y(Number<0>{}) = r;
return y;
}
template <typename TData, index_t NSize, typename Reduce>
__host__ __device__ constexpr auto
container_reverse_exclusive_scan(const Array<TData, NSize>& x, Reduce f, TData init)
{
Array<TData, NSize> y;
TData r = init;
static_for<NSize - 1, 0, -1>{}([&](auto i) {
y(i) = r;
r = f(r, x[i]);
});
y(Number<0>{}) = r;
return y;
}
template <index_t... Is, typename Reduce, index_t Init>
__host__ __device__ constexpr auto
container_reverse_exclusive_scan(const Sequence<Is...>& seq, Reduce f, Number<Init>)
{
return reverse_exclusive_scan_sequence(seq, f, Number<Init>{});
}
#if !CK_WORKAROUND_SWDEV_275126
// rocm4.1 compiler would crash with recursive lambda
template <typename... Xs, typename Reduce, typename Init>
__host__ __device__ constexpr auto
container_reverse_exclusive_scan(const Tuple<Xs...>& x, Reduce reduce, Init init)
{
constexpr index_t NSize = sizeof...(Xs);
// f is recursive function, fs is a dummy of f
// i is index, y_old is current scan, r_old is current reduction
auto f = [&](auto fs, auto i, auto y_old, auto r_old) {
auto r_new = reduce(x[i], r_old);
auto y_new = container_push_front(y_old, r_new);
if constexpr(i.value > 1)
{
// recursively call f/fs
return fs(fs, i - Number<1>{}, y_new, r_new);
}
else
{
return y_new;
}
};
// start recursion
return f(f, Number<NSize - 1>{}, make_tuple(init), init);
}
#else
// i is index, y_old is current scan, r_old is current reduction
template <typename... Xs, typename Reduce, index_t I, typename YOld, typename ROld>
__host__ __device__ constexpr auto container_reverse_exclusive_scan_impl(
const Tuple<Xs...>& x, Reduce reduce, Number<I> i, YOld y_old, ROld r_old)
{
auto r_new = reduce(x[i], r_old);
auto y_new = container_push_front(y_old, r_new);
if constexpr(i.value > 1)
{
// recursively call f/fs
return container_reverse_exclusive_scan_impl(x, reduce, i - Number<1>{}, y_new, r_new);
}
else
{
return y_new;
}
}
template <typename... Xs, typename Reduce, typename Init>
__host__ __device__ constexpr auto
container_reverse_exclusive_scan(const Tuple<Xs...>& x, Reduce reduce, Init init)
{
constexpr index_t NSize = sizeof...(Xs);
return container_reverse_exclusive_scan_impl(
x, reduce, Number<NSize - 1>{}, make_tuple(init), init);
}
#endif
// TODO: update to like container_reverse_exclusive_scan to deal with Tuple of Numebr<>
template <typename... Xs, typename Reduce, typename TData>
__host__ __device__ constexpr auto
container_reverse_inclusive_scan(const Tuple<Xs...>& x, Reduce f, TData init)
{
constexpr index_t NSize = sizeof...(Xs);
Tuple<Xs...> y;
TData r = init;
static_for<NSize - 1, 0, -1>{}([&](auto i) {
r = f(r, x[i]);
y(i) = r;
});
r = f(r, x[Number<0>{}]);
y(Number<0>{}) = r;
return y;
}
template <typename X, typename... Ys>
__host__ __device__ constexpr auto container_concat(const X& x, const Ys&... ys)
{
return container_concat(x, container_concat(ys...));
}
template <typename T, index_t NX, index_t NY>
__host__ __device__ constexpr auto container_concat(const Array<T, NX>& ax, const Array<T, NY>& ay)
{
return unpack2(
[&](auto&&... zs) { return make_array(ck::forward<decltype(zs)>(zs)...); }, ax, ay);
}
template <typename... X, typename... Y>
__host__ __device__ constexpr auto container_concat(const Tuple<X...>& tx, const Tuple<Y...>& ty)
{
return unpack2(
[&](auto&&... zs) { return make_tuple(ck::forward<decltype(zs)>(zs)...); }, tx, ty);
}
template <typename Container>
__host__ __device__ constexpr auto container_concat(const Container& x)
{
return x;
}
template <typename T, index_t N, index_t... Is>
__host__ __device__ constexpr auto get_container_subset(const Array<T, N>& arr, Sequence<Is...>)
{
static_assert(N >= sizeof...(Is), "wrong! size");
return make_array(arr[Number<Is>{}]...);
}
template <typename... Ts, index_t... Is>
__host__ __device__ constexpr auto get_container_subset(const Tuple<Ts...>& tup, Sequence<Is...>)
{
static_assert(sizeof...(Ts) >= sizeof...(Is), "wrong! size");
return make_tuple(tup[Number<Is>{}]...);
}
template <typename T, index_t N, index_t... Is>
__host__ __device__ constexpr void
set_container_subset(Array<T, N>& y, Sequence<Is...> picks, const Array<T, sizeof...(Is)>& x)
{
static_assert(N >= sizeof...(Is), "wrong! size");
static_for<0, sizeof...(Is), 1>{}([&](auto i) { y(picks[i]) = x[i]; });
}
template <typename... Ys, index_t... Is, typename... Xs>
__host__ __device__ constexpr void
set_container_subset(Tuple<Ys...>& y, Sequence<Is...> picks, const Tuple<Xs...>& x)
{
static_assert(sizeof...(Ys) >= sizeof...(Is) && sizeof...(Is) == sizeof...(Xs), "wrong! size");
static_for<0, sizeof...(Is), 1>{}([&](auto i) { y(picks[i]) = x[i]; });
}
template <index_t... Is>
__host__ __device__ constexpr auto sequence_to_tuple_of_number(Sequence<Is...>)
{
using Seq = Sequence<Is...>;
return generate_tuple(
[&](auto i) {
constexpr index_t tmp = Seq::At(i);
return Number<tmp>{};
},
Seq::Size());
}
} // namespace ck
#endif