mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-19 12:30:16 +00:00
xdlops_v4r4_fwd fp32/fp16 (#34)
* create files for xdlops
* working on blockwise_gemm_xdlops
* add KReduction
* add m/n repeats
* add 2x2 pipeline
* added 128x128 wavegemm
* use StaticBuffer of vector_type
* break vector type to blk_size
* add kpack into xldops_gemm and blockwise_gemm
* abroadcast only
* add fp32 mfma instructions
* adding fp16 mfma
* pack half4_t
* rename kperwave to kpack
* add 32x32x8fp16
* add fp16 mfma
* clean code
* clean code
* V4r4 xdlops kpack (#35)
* add kpack with incorrect results
* bug fix for make_dynamic_naive_tensor_descriptor_aligned_v2
* add 1x1 kernel
* add gridwise_gemm_v2 - single_buffer
* enabled dwordx4 for fp16
Co-authored-by: Chao Liu <chao.liu2@amd.com>
* refactor fwd-v4r4-xdlops
* add v4r4-nhwc-xdlop
* improve some perf of nhwc and nchw by tuning parameters, and change scheuduling in gridwise-gemm loop
* tweak scheduling in gridwise gemm
* add v4r3 with a single output copy
* init commit: output with slice win
* adding sliceWin
* add multiple repeats pattern
* starting adding bwd-v4r1-xdlops
* use tuple as SrcBuffer
* adding bwd-data v4r1 nhwc xdlops
* fix bug in make_dynamic_naive_tensor_descriptor_aligned_v2()
* fix bug in host bwd-data conv
* initial implementation of bwd-data v4r1 nhwc xdlops
* add launch bound flags
* enable launch bound
* add m/nrepeat=4
* tweak bwd-data v4r1 nhwc xdlops
* added bwd-data v4r1 nhwc xlops with output A and weight B
* add fwd-v4r4 nhwc xdlops, A input, B weight, C output
Co-authored-by: Chao Liu <chao.liu2@amd.com>
[ROCm/composable_kernel commit: 3835318cc3]
This commit is contained in:
@@ -9,7 +9,7 @@
|
||||
#include <cassert>
|
||||
#include <iostream>
|
||||
|
||||
template <class Range>
|
||||
template <typename Range>
|
||||
std::ostream& LogRange(std::ostream& os, Range&& range, std::string delim)
|
||||
{
|
||||
bool first = true;
|
||||
@@ -24,12 +24,27 @@ std::ostream& LogRange(std::ostream& os, Range&& range, std::string delim)
|
||||
return os;
|
||||
}
|
||||
|
||||
template <typename T, typename Range>
|
||||
std::ostream& LogRangeAsType(std::ostream& os, Range&& range, std::string delim)
|
||||
{
|
||||
bool first = true;
|
||||
for(auto&& v : range)
|
||||
{
|
||||
if(first)
|
||||
first = false;
|
||||
else
|
||||
os << delim;
|
||||
os << T{v};
|
||||
}
|
||||
return os;
|
||||
}
|
||||
|
||||
typedef enum {
|
||||
Half = 0,
|
||||
Float = 1,
|
||||
} DataType_t;
|
||||
|
||||
template <class T>
|
||||
template <typename T>
|
||||
struct DataType;
|
||||
|
||||
template <>
|
||||
@@ -37,13 +52,13 @@ struct DataType<float> : std::integral_constant<DataType_t, DataType_t::Float>
|
||||
{
|
||||
};
|
||||
|
||||
template <class F, class T, std::size_t... Is>
|
||||
template <typename F, typename T, std::size_t... Is>
|
||||
auto call_f_unpack_args_impl(F f, T args, std::index_sequence<Is...>)
|
||||
{
|
||||
return f(std::get<Is>(args)...);
|
||||
}
|
||||
|
||||
template <class F, class T>
|
||||
template <typename F, typename T>
|
||||
auto call_f_unpack_args(F f, T args)
|
||||
{
|
||||
constexpr std::size_t N = std::tuple_size<T>{};
|
||||
@@ -51,13 +66,13 @@ auto call_f_unpack_args(F f, T args)
|
||||
return call_f_unpack_args_impl(f, args, std::make_index_sequence<N>{});
|
||||
}
|
||||
|
||||
template <class F, class T, std::size_t... Is>
|
||||
template <typename F, typename T, std::size_t... Is>
|
||||
auto construct_f_unpack_args_impl(T args, std::index_sequence<Is...>)
|
||||
{
|
||||
return F(std::get<Is>(args)...);
|
||||
}
|
||||
|
||||
template <class F, class T>
|
||||
template <typename F, typename T>
|
||||
auto construct_f_unpack_args(F, T args)
|
||||
{
|
||||
constexpr std::size_t N = std::tuple_size<T>{};
|
||||
@@ -77,13 +92,13 @@ struct HostTensorDescriptor
|
||||
|
||||
void CalculateStrides();
|
||||
|
||||
template <class Range>
|
||||
template <typename Range>
|
||||
HostTensorDescriptor(const Range& lens) : mLens(lens.begin(), lens.end())
|
||||
{
|
||||
this->CalculateStrides();
|
||||
}
|
||||
|
||||
template <class Range1, class Range2>
|
||||
template <typename Range1, typename Range2>
|
||||
HostTensorDescriptor(const Range1& lens, const Range2& strides)
|
||||
: mLens(lens.begin(), lens.end()), mStrides(strides.begin(), strides.end())
|
||||
{
|
||||
@@ -96,7 +111,7 @@ struct HostTensorDescriptor
|
||||
const std::vector<std::size_t>& GetLengths() const;
|
||||
const std::vector<std::size_t>& GetStrides() const;
|
||||
|
||||
template <class... Is>
|
||||
template <typename... Is>
|
||||
std::size_t GetOffsetFromMultiIndex(Is... is) const
|
||||
{
|
||||
assert(sizeof...(Is) == this->GetNumOfDimension());
|
||||
@@ -111,7 +126,7 @@ struct HostTensorDescriptor
|
||||
|
||||
struct joinable_thread : std::thread
|
||||
{
|
||||
template <class... Xs>
|
||||
template <typename... Xs>
|
||||
joinable_thread(Xs&&... xs) : std::thread(std::forward<Xs>(xs)...)
|
||||
{
|
||||
}
|
||||
@@ -126,7 +141,7 @@ struct joinable_thread : std::thread
|
||||
}
|
||||
};
|
||||
|
||||
template <class F, class... Xs>
|
||||
template <typename F, typename... Xs>
|
||||
struct ParallelTensorFunctor
|
||||
{
|
||||
F mF;
|
||||
@@ -180,26 +195,26 @@ struct ParallelTensorFunctor
|
||||
}
|
||||
};
|
||||
|
||||
template <class F, class... Xs>
|
||||
template <typename F, typename... Xs>
|
||||
auto make_ParallelTensorFunctor(F f, Xs... xs)
|
||||
{
|
||||
return ParallelTensorFunctor<F, Xs...>(f, xs...);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
template <typename T>
|
||||
struct Tensor
|
||||
{
|
||||
template <class X>
|
||||
template <typename X>
|
||||
Tensor(std::initializer_list<X> lens) : mDesc(lens), mData(mDesc.GetElementSpace())
|
||||
{
|
||||
}
|
||||
|
||||
template <class X>
|
||||
template <typename X>
|
||||
Tensor(std::vector<X> lens) : mDesc(lens), mData(mDesc.GetElementSpace())
|
||||
{
|
||||
}
|
||||
|
||||
template <class X, class Y>
|
||||
template <typename X, typename Y>
|
||||
Tensor(std::vector<X> lens, std::vector<Y> strides)
|
||||
: mDesc(lens, strides), mData(mDesc.GetElementSpace())
|
||||
{
|
||||
@@ -207,7 +222,7 @@ struct Tensor
|
||||
|
||||
Tensor(const HostTensorDescriptor& desc) : mDesc(desc), mData(mDesc.GetElementSpace()) {}
|
||||
|
||||
template <class G>
|
||||
template <typename G>
|
||||
void GenerateTensorValue(G g, std::size_t num_thread = 1)
|
||||
{
|
||||
switch(mDesc.GetNumOfDimension())
|
||||
@@ -247,13 +262,13 @@ struct Tensor
|
||||
}
|
||||
}
|
||||
|
||||
template <class... Is>
|
||||
template <typename... Is>
|
||||
T& operator()(Is... is)
|
||||
{
|
||||
return mData[mDesc.GetOffsetFromMultiIndex(is...)];
|
||||
}
|
||||
|
||||
template <class... Is>
|
||||
template <typename... Is>
|
||||
const T& operator()(Is... is) const
|
||||
{
|
||||
return mData[mDesc.GetOffsetFromMultiIndex(is...)];
|
||||
@@ -285,7 +300,7 @@ HostTensorDescriptor::HostTensorDescriptor(std::vector<X> lens, std::vector<Y> s
|
||||
|
||||
void ostream_HostTensorDescriptor(const HostTensorDescriptor& desc, std::ostream& os = std::cout);
|
||||
|
||||
template <class T>
|
||||
template <typename T>
|
||||
void check_error(const Tensor<T>& ref, const Tensor<T>& result)
|
||||
{
|
||||
float error = 0;
|
||||
|
||||
Reference in New Issue
Block a user