xdlops_v4r4_fwd fp32/fp16 (#34)

* create files for xdlops * working on blockwise_gemm_xdlops * add KReduction * add m/n repeats * add 2x2 pipeline * added 128x128 wavegemm * use StaticBuffer of vector_type * break vector type to blk_size * add kpack into xdlops_gemm and blockwise_gemm * abroadcast only * add fp32 mfma instructions * adding fp16 mfma * pack half4_t * rename kperwave to kpack * add 32x32x8fp16 * add fp16 mfma * clean code * clean code * V4r4 xdlops kpack (#35) * add kpack with incorrect results * bug fix for make_dynamic_naive_tensor_descriptor_aligned_v2 * add 1x1 kernel * add gridwise_gemm_v2 - single_buffer * enabled dwordx4 for fp16 Co-authored-by: Chao Liu <chao.liu2@amd.com> * refactor fwd-v4r4-xdlops * add v4r4-nhwc-xdlops * improve some perf of nhwc and nchw by tuning parameters, and change scheduling in gridwise-gemm loop * tweak scheduling in gridwise gemm * add v4r3 with a single output copy * init commit: output with slice win * adding sliceWin * add multiple repeats pattern * starting adding bwd-v4r1-xdlops * use tuple as SrcBuffer * adding bwd-data v4r1 nhwc xdlops * fix bug in make_dynamic_naive_tensor_descriptor_aligned_v2() * fix bug in host bwd-data conv * initial implementation of bwd-data v4r1 nhwc xdlops * add launch bound flags * enable launch bound * add m/nrepeat=4 * tweak bwd-data v4r1 nhwc xdlops * added bwd-data v4r1 nhwc xdlops with output A and weight B * add fwd-v4r4 nhwc xdlops, A input, B weight, C output Co-authored-by: Chao Liu <chao.liu2@amd.com> [ROCm/composable_kernel commit: 3835318cc3]
2026-05-19 12:30:16 +00:00 · 2021-07-01 14:33:00 -05:00
parent 817b2a47c6
commit 67dcc552b6
54 changed files with 9813 additions and 245 deletions
--- a/driver/include/host_tensor.hpp
+++ b/driver/include/host_tensor.hpp
@@ -9,7 +9,7 @@
 #include <cassert>
 #include <iostream>

-template <class Range>
+template <typename Range>
 std::ostream& LogRange(std::ostream& os, Range&& range, std::string delim)
 {
    bool first = true;
@@ -24,12 +24,27 @@ std::ostream& LogRange(std::ostream& os, Range&& range, std::string delim)
    return os;
 }

+template <typename T, typename Range> // streams each element of `range` to `os` converted to T
+std::ostream& LogRangeAsType(std::ostream& os, Range&& range, std::string delim)
+{
+    bool first = true; // true until the first element has been written
+    for(auto&& v : range)
+    {
+        if(first)
+            first = false;
+        else
+            os << delim; // `delim` goes between elements only, never before the first
+        os << static_cast<T>(v); // explicit cast: T{v} is ill-formed when the conversion narrows
+    }
+    return os; // the same stream that was passed in
+}
+
 typedef enum {
    Half  = 0,
    Float = 1,
 } DataType_t;

-template <class T>
+template <typename T> // primary template is declared only; per-type specializations follow
 struct DataType;

 template <>
@@ -37,13 +52,13 @@ struct DataType<float> : std::integral_constant<DataType_t, DataType_t::Float>
 {
 };

-template <class F, class T, std::size_t... Is>
+template <typename F, typename T, std::size_t... Is> // calls f(std::get<Is>(args)...) and returns its result
 auto call_f_unpack_args_impl(F f, T args, std::index_sequence<Is...>)
 {
    return f(std::get<Is>(args)...);
 }

-template <class F, class T>
+template <typename F, typename T>
 auto call_f_unpack_args(F f, T args)
 {
    constexpr std::size_t N = std::tuple_size<T>{};
@@ -51,13 +66,13 @@ auto call_f_unpack_args(F f, T args)
    return call_f_unpack_args_impl(f, args, std::make_index_sequence<N>{});
 }

-template <class F, class T, std::size_t... Is>
+template <typename F, typename T, std::size_t... Is> // returns an F constructed from std::get<Is>(args)...
 auto construct_f_unpack_args_impl(T args, std::index_sequence<Is...>)
 {
    return F(std::get<Is>(args)...);
 }

-template <class F, class T>
+template <typename F, typename T>
 auto construct_f_unpack_args(F, T args)
 {
    constexpr std::size_t N = std::tuple_size<T>{};
@@ -77,13 +92,13 @@ struct HostTensorDescriptor

    void CalculateStrides();

-    template <class Range>
+    template <typename Range> // copies [lens.begin(), lens.end()) into mLens, then calls CalculateStrides()
    HostTensorDescriptor(const Range& lens) : mLens(lens.begin(), lens.end())
    {
        this->CalculateStrides();
    }

-    template <class Range1, class Range2>
+    template <typename Range1, typename Range2>
    HostTensorDescriptor(const Range1& lens, const Range2& strides)
        : mLens(lens.begin(), lens.end()), mStrides(strides.begin(), strides.end())
    {
@@ -96,7 +111,7 @@ struct HostTensorDescriptor
    const std::vector<std::size_t>& GetLengths() const;
    const std::vector<std::size_t>& GetStrides() const;

-    template <class... Is>
+    template <typename... Is>
    std::size_t GetOffsetFromMultiIndex(Is... is) const
    {
        assert(sizeof...(Is) == this->GetNumOfDimension());
@@ -111,7 +126,7 @@ struct HostTensorDescriptor

 struct joinable_thread : std::thread
 {
-    template <class... Xs>
+    template <typename... Xs> // perfect-forwards every argument to the std::thread base constructor
    joinable_thread(Xs&&... xs) : std::thread(std::forward<Xs>(xs)...)
    {
    }
@@ -126,7 +141,7 @@ struct joinable_thread : std::thread
    }
 };

-template <class F, class... Xs>
+template <typename F, typename... Xs>
 struct ParallelTensorFunctor
 {
    F mF;
@@ -180,26 +195,26 @@ struct ParallelTensorFunctor
    }
 };

-template <class F, class... Xs>
+template <typename F, typename... Xs> // deduces F and Xs... and returns ParallelTensorFunctor<F, Xs...>(f, xs...)
 auto make_ParallelTensorFunctor(F f, Xs... xs)
 {
    return ParallelTensorFunctor<F, Xs...>(f, xs...);
 }

-template <class T>
+template <typename T>
 struct Tensor
 {
-    template <class X>
+    template <typename X> // builds mDesc from lens, then constructs mData with mDesc.GetElementSpace()
    Tensor(std::initializer_list<X> lens) : mDesc(lens), mData(mDesc.GetElementSpace())
    {
    }

-    template <class X>
+    template <typename X> // builds mDesc from lens, then constructs mData with mDesc.GetElementSpace()
    Tensor(std::vector<X> lens) : mDesc(lens), mData(mDesc.GetElementSpace())
    {
    }

-    template <class X, class Y>
+    template <typename X, typename Y>
    Tensor(std::vector<X> lens, std::vector<Y> strides)
        : mDesc(lens, strides), mData(mDesc.GetElementSpace())
    {
@@ -207,7 +222,7 @@ struct Tensor

    Tensor(const HostTensorDescriptor& desc) : mDesc(desc), mData(mDesc.GetElementSpace()) {}

-    template <class G>
+    template <typename G>
    void GenerateTensorValue(G g, std::size_t num_thread = 1)
    {
        switch(mDesc.GetNumOfDimension())
@@ -247,13 +262,13 @@ struct Tensor
        }
    }

-    template <class... Is>
+    template <typename... Is> // mutable access to mData at the offset mDesc computes for indices is...
    T& operator()(Is... is)
    {
        return mData[mDesc.GetOffsetFromMultiIndex(is...)];
    }

-    template <class... Is>
+    template <typename... Is>
    const T& operator()(Is... is) const
    {
        return mData[mDesc.GetOffsetFromMultiIndex(is...)];
@@ -285,7 +300,7 @@ HostTensorDescriptor::HostTensorDescriptor(std::vector<X> lens, std::vector<Y> s

 void ostream_HostTensorDescriptor(const HostTensorDescriptor& desc, std::ostream& os = std::cout);

-template <class T>
+template <typename T>
 void check_error(const Tensor<T>& ref, const Tensor<T>& result)
 {
    float error     = 0;