mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-16 19:09:59 +00:00
* Initial adding of generic reduction
* Initial adding of generic reduction ...
* Updates to make compiling done
* clang-format all files
* clang-format some files again
* Renaming in profiler/include/profile_reduce.hpp
* Updates and make BlockWise cases passed
* Updates and make ThreadWise and MultiBlockTwoCall cases passed
* Remove the support for MUL and NORM1 reduceOp from the profiler and the device instances
* Change to replace the dim0_max_vector_size/dim1_max_vector_size template argument in the device reduce classes
* format
* adding pooling
* added max and average pooling
* comment out cout and kernel timing
* Tiny simplification in profiler/reduce_profiler.cpp
* Add example for reduce_blockwise
* Tiny updates
* Change to pass the ElementWiseOp from device layer to kernel
* Fix the vectorDim and vectorSize in Device layer
* Enable vector load on both dim0 and dim1 for Threadwise method
* Tiny updates
* Change to let the user to pass the preUnaryOp and posUnaryOp
* Make pooling example work
* split device_reduce_instance into two libraries
* Tiny update
* Replace nanPropaOpt enum by boolean propagate_nan
* Simplification in DeviceReduce layer codes
* update build
* Change to clarify the difference between ck::half_t and half_float::half
* Renaming in all the reduction codes
* Add VectorSize as template parameter for device layer
* Add BetaIsZero as kernel template and as AccDataType for alpha
* print
* Small updates for pooling
* Updates for host_generic_reduction for reference
* Update to make AVG pooling pass
* Update to make MAX pooling with indices output pass
* fix
* add OutDst vector store to threadwise reduction and pooling
* tweak
* turn off check_indices that caused build issue
* refactor pooling
* clean up
* turn off check_indices for building issue for php-compiler
* add more tile size for odd C
* tweak conv for odd C
* update script
* clean up elementwise op
* add hack in reduction_operator.hpp to avoid compile error. To fix it, need to use element_wise_op in reduction op
* Add OutVectorSize as device and kernel tunable, also update to Elementwise Operations
* Move reduce operator mapping to host layer file reduction_operator_mapping.hpp from reduction_operator.hpp
* Change to the unary operators
* Move the definitions of unary operations to element_wise_operation.hpp
* re-org files
* Refine in device interfaces and multiblock kernels
* Split the reduction configurations into instances for specific methods
* Update in getTypeString() of device pool2d
* Renaming in host and kernel
* Tiny update in profiler/src/profiler.cpp
* Uncomment in device_operation/CMakeLists.txt to enable the building of all operations
* Make check_indices a templated function to remove some linking issue
* Renaming in the profiler reduce module
* Add support for double Reduction (but disable MultiblockAtomicAdd for double)
* Tiny correction of literal string
* Rename DevicePoolFwd to DevicePool2dFwd
* Split device_reduce_instance_xxx.cpp files according to the data types to speed up compiling
* Add comments for lists of configurations, lists of instances and references of add_reduce_instances_xxx
* Remove un-used header file gridwise_generic_reduction_wrapper_common.hpp
* Renaming and refining in the Reduction codes
* Tiny change in the unary operators
* Renaming symbols and files
* Renaming symbols in the kernels
* Move kernel kernel_set_buffer_value to separate file
* Add IndexDataType template parameter for kernels and use int32_t as index data type in device layer
* Tiny update in the kernels
* Remove definition of sqrtf()/isnan()/abs() for half_t due to some ADL issue
* Simplify a helper function in device layer
* Tiny adjustment in testing data initialization
* Renaming in kernel/device/host
* Add two testing scripts for reduction
* Refine the Unary operators in element_wise_operation.hpp
* Update in the reduce profiler module
* Update to the reduction testing scripts
* reduce compile parallelism
* change CI docker to rocm5.0
* remove unused variables
* fix build
Co-authored-by: Chao Liu <chao.liu2@amd.com>
[ROCm/composable_kernel commit: e17c0d8008]
384 lines
11 KiB
C++
384 lines
11 KiB
C++
#ifndef HOST_TENSOR_HPP
|
|
#define HOST_TENSOR_HPP
|
|
|
|
#include <algorithm>
#include <array>
#include <cassert>
#include <cmath>
#include <cstddef>
#include <functional>
#include <iostream>
#include <numeric>
#include <stdexcept>
#include <string>
#include <thread>
#include <tuple>
#include <utility>
#include <vector>

#include "data_type.hpp"
|
|
|
|
// Stream every element of `range` to `os`, separated by `delim`.
// Nothing is printed for an empty range. Returns `os` for chaining.
template <typename Range>
std::ostream& LogRange(std::ostream& os, Range&& range, std::string delim)
{
    bool needs_delim = false;
    for(auto&& value : range)
    {
        if(needs_delim)
            os << delim;
        os << value;
        needs_delim = true;
    }
    return os;
}
|
|
|
|
// Stream every element of `range` to `os`, separated by `delim`, after
// converting each element to T via static_cast (e.g. print half floats
// as float). Returns `os` for chaining.
template <typename T, typename Range>
std::ostream& LogRangeAsType(std::ostream& os, Range&& range, std::string delim)
{
    bool needs_delim = false;
    for(auto&& value : range)
    {
        if(needs_delim)
            os << delim;
        os << static_cast<T>(value);
        needs_delim = true;
    }
    return os;
}
|
|
|
|
// Host-side tag for tensor element data types. Kept as a plain
// (unscoped) C-style enum so the values convert implicitly to int;
// the numeric values are explicit and should be treated as stable.
typedef enum
{
    Half = 0,
    Float = 1,
} DataType_t;
|
|
|
|
// Compile-time map from a C++ element type to its DataType_t tag.
// Only explicit specializations are usable; the primary template is
// declared but never defined, so unsupported types fail to compile.
template <typename T>
struct DataType;

// float maps to DataType_t::Float. Inheriting integral_constant makes
// DataType<float>::value yield the tag.
// NOTE(review): no specialization for the Half tag is visible in this
// file — presumably provided elsewhere; confirm before relying on it.
template <>
struct DataType<float> : std::integral_constant<DataType_t, DataType_t::Float>
{
};
|
|
|
|
// Helper for call_f_unpack_args: expands the tuple elements selected by
// the index pack Is... into an ordinary argument list, i.e. calls
// f(get<0>(args), get<1>(args), ...).
template <typename F, typename T, std::size_t... Is>
auto call_f_unpack_args_impl(F f, T args, std::index_sequence<Is...>)
{
    return f(std::get<Is>(args)...);
}
|
|
|
|
// Invoke f with the elements of the tuple-like `args` (std::tuple,
// std::array, std::pair) as individual arguments:
// f(get<0>(args), get<1>(args), ...). Returns whatever f returns.
template <typename F, typename T>
auto call_f_unpack_args(F f, T args)
{
    // std::apply (C++17, the standard this header already uses via
    // `if constexpr`) performs exactly this unpacking.
    return std::apply(f, args);
}
|
|
|
|
// Helper for construct_f_unpack_args: constructs an F, passing the
// tuple elements selected by Is... as constructor arguments.
template <typename F, typename T, std::size_t... Is>
auto construct_f_unpack_args_impl(T args, std::index_sequence<Is...>)
{
    return F(std::get<Is>(args)...);
}
|
|
|
|
template <typename F, typename T>
|
|
auto construct_f_unpack_args(F, T args)
|
|
{
|
|
constexpr std::size_t N = std::tuple_size<T>{};
|
|
|
|
return construct_f_unpack_args_impl<F>(args, std::make_index_sequence<N>{});
|
|
}
|
|
|
|
// Describes the shape of an N-dimensional host tensor: per-dimension
// lengths and per-dimension strides (in elements, not bytes).
struct HostTensorDescriptor
{
    // A descriptor without lengths is meaningless.
    HostTensorDescriptor() = delete;

    // Construct from lengths only; strides are derived via
    // CalculateStrides(). Defined out-of-line near the end of the file.
    template <typename X>
    HostTensorDescriptor(std::vector<X> lens);

    // Construct from explicit lengths and strides (out-of-line below).
    template <typename X, typename Y>
    HostTensorDescriptor(std::vector<X> lens, std::vector<Y> strides);

    // Derives mStrides from mLens. NOTE(review): the definition is not
    // in this header — presumably dense row-major packing; confirm in
    // the implementation file.
    void CalculateStrides();

    // Construct from any iterable range of lengths; strides derived.
    template <typename Range>
    HostTensorDescriptor(const Range& lens) : mLens(lens.begin(), lens.end())
    {
        this->CalculateStrides();
    }

    // Construct from iterable ranges of lengths and strides.
    template <typename Range1, typename Range2>
    HostTensorDescriptor(const Range1& lens, const Range2& strides)
        : mLens(lens.begin(), lens.end()), mStrides(strides.begin(), strides.end())
    {
    }

    // Accessors; defined out of view (implementation file).
    std::size_t GetNumOfDimension() const;
    std::size_t GetElementSize() const;
    // NOTE(review): presumably the addressed span (can exceed the
    // element count for non-packed strides) — confirm in the .cpp.
    std::size_t GetElementSpace() const;

    const std::vector<std::size_t>& GetLengths() const;
    const std::vector<std::size_t>& GetStrides() const;

    // Flat element offset of the multi-index (is...): the inner product
    // of the indices with the strides. The pack size must equal the
    // dimension count (checked only by assert in debug builds).
    template <typename... Is>
    std::size_t GetOffsetFromMultiIndex(Is... is) const
    {
        assert(sizeof...(Is) == this->GetNumOfDimension());
        std::initializer_list<std::size_t> iss{static_cast<std::size_t>(is)...};
        return std::inner_product(iss.begin(), iss.end(), mStrides.begin(), std::size_t{0});
    }

    friend std::ostream& operator<<(std::ostream& os, const HostTensorDescriptor& desc);

    private:
    std::vector<std::size_t> mLens;
    std::vector<std::size_t> mStrides;
};
|
|
|
|
// RAII wrapper over std::thread that joins in its destructor, so a
// still-running thread can never reach std::thread::~thread joinable
// (which would call std::terminate). Move-only, like std::thread.
struct joinable_thread : std::thread
{
    // Forward any constructor arguments straight to std::thread.
    template <typename... Xs>
    joinable_thread(Xs&&... xs) : std::thread(std::forward<Xs>(xs)...)
    {
    }

    joinable_thread(joinable_thread&&) = default;
    joinable_thread& operator=(joinable_thread&&) = default;

    // Wait for the thread before destroying it.
    ~joinable_thread()
    {
        if(joinable())
        {
            join();
        }
    }
};
|
|
|
|
template <typename F, typename... Xs>
|
|
struct ParallelTensorFunctor
|
|
{
|
|
F mF;
|
|
static constexpr std::size_t NDIM = sizeof...(Xs);
|
|
std::array<std::size_t, NDIM> mLens;
|
|
std::array<std::size_t, NDIM> mStrides;
|
|
std::size_t mN1d;
|
|
|
|
ParallelTensorFunctor(F f, Xs... xs) : mF(f), mLens({static_cast<std::size_t>(xs)...})
|
|
{
|
|
mStrides.back() = 1;
|
|
std::partial_sum(mLens.rbegin(),
|
|
mLens.rend() - 1,
|
|
mStrides.rbegin() + 1,
|
|
std::multiplies<std::size_t>());
|
|
mN1d = mStrides[0] * mLens[0];
|
|
}
|
|
|
|
std::array<std::size_t, NDIM> GetNdIndices(std::size_t i) const
|
|
{
|
|
std::array<std::size_t, NDIM> indices;
|
|
|
|
for(int idim = 0; idim < NDIM; ++idim)
|
|
{
|
|
indices[idim] = i / mStrides[idim];
|
|
i -= indices[idim] * mStrides[idim];
|
|
}
|
|
|
|
return indices;
|
|
}
|
|
|
|
void operator()(std::size_t num_thread = std::thread::hardware_concurrency()) const
|
|
{
|
|
std::size_t work_per_thread = (mN1d + num_thread - 1) / num_thread;
|
|
|
|
std::vector<joinable_thread> threads(num_thread);
|
|
|
|
for(std::size_t it = 0; it < num_thread; ++it)
|
|
{
|
|
std::size_t iw_begin = it * work_per_thread;
|
|
std::size_t iw_end = std::min((it + 1) * work_per_thread, mN1d);
|
|
|
|
auto f = [=] {
|
|
for(std::size_t iw = iw_begin; iw < iw_end; ++iw)
|
|
{
|
|
call_f_unpack_args(mF, GetNdIndices(iw));
|
|
}
|
|
};
|
|
threads[it] = joinable_thread(f);
|
|
}
|
|
}
|
|
};
|
|
|
|
template <typename F, typename... Xs>
|
|
auto make_ParallelTensorFunctor(F f, Xs... xs)
|
|
{
|
|
return ParallelTensorFunctor<F, Xs...>(f, xs...);
|
|
}
|
|
|
|
template <typename T>
|
|
struct Tensor
|
|
{
|
|
template <typename X>
|
|
Tensor(std::initializer_list<X> lens) : mDesc(lens), mData(mDesc.GetElementSpace())
|
|
{
|
|
}
|
|
|
|
template <typename X>
|
|
Tensor(std::vector<X> lens) : mDesc(lens), mData(mDesc.GetElementSpace())
|
|
{
|
|
}
|
|
|
|
template <typename X, typename Y>
|
|
Tensor(std::vector<X> lens, std::vector<Y> strides)
|
|
: mDesc(lens, strides), mData(mDesc.GetElementSpace())
|
|
{
|
|
}
|
|
|
|
Tensor(const HostTensorDescriptor& desc) : mDesc(desc), mData(mDesc.GetElementSpace()) {}
|
|
|
|
template <typename G>
|
|
void GenerateTensorValue(G g, std::size_t num_thread = std::thread::hardware_concurrency())
|
|
{
|
|
switch(mDesc.GetNumOfDimension())
|
|
{
|
|
case 1: {
|
|
auto f = [&](auto i) { (*this)(i) = g(i); };
|
|
make_ParallelTensorFunctor(f, mDesc.GetLengths()[0])(num_thread);
|
|
break;
|
|
}
|
|
case 2: {
|
|
auto f = [&](auto i0, auto i1) { (*this)(i0, i1) = g(i0, i1); };
|
|
make_ParallelTensorFunctor(f, mDesc.GetLengths()[0], mDesc.GetLengths()[1])(num_thread);
|
|
break;
|
|
}
|
|
case 3: {
|
|
auto f = [&](auto i0, auto i1, auto i2) { (*this)(i0, i1, i2) = g(i0, i1, i2); };
|
|
make_ParallelTensorFunctor(
|
|
f, mDesc.GetLengths()[0], mDesc.GetLengths()[1], mDesc.GetLengths()[2])(num_thread);
|
|
break;
|
|
}
|
|
case 4: {
|
|
auto f = [&](auto i0, auto i1, auto i2, auto i3) {
|
|
(*this)(i0, i1, i2, i3) = g(i0, i1, i2, i3);
|
|
};
|
|
make_ParallelTensorFunctor(f,
|
|
mDesc.GetLengths()[0],
|
|
mDesc.GetLengths()[1],
|
|
mDesc.GetLengths()[2],
|
|
mDesc.GetLengths()[3])(num_thread);
|
|
break;
|
|
}
|
|
case 5: {
|
|
auto f = [&](auto i0, auto i1, auto i2, auto i3, auto i4) {
|
|
(*this)(i0, i1, i2, i3, i4) = g(i0, i1, i2, i3, i4);
|
|
};
|
|
make_ParallelTensorFunctor(f,
|
|
mDesc.GetLengths()[0],
|
|
mDesc.GetLengths()[1],
|
|
mDesc.GetLengths()[2],
|
|
mDesc.GetLengths()[3],
|
|
mDesc.GetLengths()[4])(num_thread);
|
|
break;
|
|
}
|
|
default: throw std::runtime_error("unspported dimension");
|
|
}
|
|
}
|
|
|
|
template <typename... Is>
|
|
T& operator()(Is... is)
|
|
{
|
|
return mData[mDesc.GetOffsetFromMultiIndex(is...)];
|
|
}
|
|
|
|
template <typename... Is>
|
|
const T& operator()(Is... is) const
|
|
{
|
|
return mData[mDesc.GetOffsetFromMultiIndex(is...)];
|
|
}
|
|
|
|
typename std::vector<T>::iterator begin() { return mData.begin(); }
|
|
|
|
typename std::vector<T>::iterator end() { return mData.end(); }
|
|
|
|
typename std::vector<T>::const_iterator begin() const { return mData.begin(); }
|
|
|
|
typename std::vector<T>::const_iterator end() const { return mData.end(); }
|
|
|
|
HostTensorDescriptor mDesc;
|
|
std::vector<T> mData;
|
|
};
|
|
|
|
template <typename X>
|
|
HostTensorDescriptor::HostTensorDescriptor(std::vector<X> lens) : mLens(lens)
|
|
{
|
|
this->CalculateStrides();
|
|
}
|
|
|
|
template <typename X, typename Y>
|
|
HostTensorDescriptor::HostTensorDescriptor(std::vector<X> lens, std::vector<Y> strides)
|
|
: mLens(lens), mStrides(strides)
|
|
{
|
|
}
|
|
|
|
void ostream_HostTensorDescriptor(const HostTensorDescriptor& desc, std::ostream& os = std::cout);
|
|
|
|
float bf16_to_f32_(ck::bhalf_t src_val);
|
|
|
|
void bf16_to_f32_(const Tensor<ck::bhalf_t>& src, Tensor<float>& dst);
|
|
|
|
template <typename T>
|
|
void check_error(const Tensor<T>& ref, const Tensor<T>& result)
|
|
{
|
|
float error = 0;
|
|
float max_diff = -1;
|
|
float ref_value = 0, result_value = 0;
|
|
|
|
if constexpr(std::is_same<ck::bhalf_t, T>::value)
|
|
{
|
|
for(int i = 0; i < ref.mData.size(); ++i)
|
|
{
|
|
error += std::abs(bf16_to_f32_(ref.mData[i]) - bf16_to_f32_(result.mData[i]));
|
|
float diff = std::abs(bf16_to_f32_(ref.mData[i]) - bf16_to_f32_(result.mData[i]));
|
|
if(max_diff < diff)
|
|
{
|
|
max_diff = diff;
|
|
ref_value = bf16_to_f32_(ref.mData[i]);
|
|
result_value = bf16_to_f32_(result.mData[i]);
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
for(int i = 0; i < ref.mData.size(); ++i)
|
|
{
|
|
error += std::abs(double(ref.mData[i]) - double(result.mData[i]));
|
|
float diff = std::abs(double(ref.mData[i]) - double(result.mData[i]));
|
|
if(max_diff < diff)
|
|
{
|
|
max_diff = diff;
|
|
ref_value = ref.mData[i];
|
|
result_value = result.mData[i];
|
|
}
|
|
}
|
|
}
|
|
|
|
std::cout << "error: " << error << std::endl;
|
|
std::cout << "max_diff: " << max_diff << ", " << ref_value << ", " << result_value << std::endl;
|
|
}
|
|
|
|
template <typename T>
|
|
void check_indices(const Tensor<T>& ref, const Tensor<T>& result)
|
|
{
|
|
bool has_error = false;
|
|
int error_count = 0;
|
|
|
|
for(int i = 0; i < ref.mData.size(); ++i)
|
|
{
|
|
if(ref.mData[i] != result.mData[i])
|
|
{
|
|
std::cerr << std::endl
|
|
<< "Indices different at position " << i << " (ref: " << ref.mData[i]
|
|
<< ", result: " << result.mData[i] << ")" << std::endl;
|
|
has_error = true;
|
|
error_count++;
|
|
if(error_count == 20)
|
|
break;
|
|
};
|
|
}
|
|
|
|
if(!has_error)
|
|
std::cout << std::endl << "Indices result is completely acccurate!" << std::endl;
|
|
}
|
|
|
|
#endif
|