xdlops_v4r4_fwd fp32/fp16 (#34)

* create files for xdlops * working on blockwise_gemm_xdlops * add KReduction * add m/n repeats * add 2x2 pipeline * added 128x128 wavegemm * use StaticBuffer of vector_type * break vector type to blk_size * add kpack into xdlops_gemm and blockwise_gemm * abroadcast only * add fp32 mfma instructions * adding fp16 mfma * pack half4_t * rename kperwave to kpack * add 32x32x8fp16 * add fp16 mfma * clean code * clean code * V4r4 xdlops kpack (#35) * add kpack with incorrect results * bug fix for make_dynamic_naive_tensor_descriptor_aligned_v2 * add 1x1 kernel * add gridwise_gemm_v2 - single_buffer * enabled dwordx4 for fp16 Co-authored-by: Chao Liu <chao.liu2@amd.com> * refactor fwd-v4r4-xdlops * add v4r4-nhwc-xdlops * improve some perf of nhwc and nchw by tuning parameters, and change scheduling in gridwise-gemm loop * tweak scheduling in gridwise gemm * add v4r3 with a single output copy * init commit: output with slice win * adding sliceWin * add multiple repeats pattern * starting adding bwd-v4r1-xdlops * use tuple as SrcBuffer * adding bwd-data v4r1 nhwc xdlops * fix bug in make_dynamic_naive_tensor_descriptor_aligned_v2() * fix bug in host bwd-data conv * initial implementation of bwd-data v4r1 nhwc xdlops * add launch bound flags * enable launch bound * add m/nrepeat=4 * tweak bwd-data v4r1 nhwc xdlops * added bwd-data v4r1 nhwc xdlops with output A and weight B * add fwd-v4r4 nhwc xdlops, A input, B weight, C output Co-authored-by: Chao Liu <chao.liu2@amd.com> [ROCm/composable_kernel commit: 3835318cc3]
2026-05-17 11:30:02 +00:00 · 2021-07-01 14:33:00 -05:00
parent 817b2a47c6
commit 67dcc552b6
54 changed files with 9813 additions and 245 deletions
--- a/composable_kernel/include/utility/math.hpp
+++ b/composable_kernel/include/utility/math.hpp
@@ -9,25 +9,25 @@
 namespace ck {
 namespace math {

-template <class T, T s>
+template <typename T, T s>
 struct scales
 {
    __host__ __device__ constexpr T operator()(T a) const { return s * a; }
 };

-template <class T>
+template <typename T>
 struct plus
 {
    __host__ __device__ constexpr T operator()(T a, T b) const { return a + b; }
 };

-template <class T>
+template <typename T>
 struct minus
 {
    __host__ __device__ constexpr T operator()(T a, T b) const { return a - b; }
 };

-template <class T>
+template <typename T>
 struct multiplies
 {
    __host__ __device__ constexpr T operator()(T a, T b) const { return a * b; }
@@ -42,83 +42,111 @@ struct multiplies_v2
    }
 };

-template <class T>
+template <typename T>
 struct maximize
 {
    __host__ __device__ constexpr T operator()(T a, T b) const { return a >= b ? a : b; }
 };

-template <class T>
+template <typename T>
 struct minimize
 {
    __host__ __device__ constexpr T operator()(T a, T b) const { return a <= b ? a : b; }
 };

-template <class T>
+template <typename T>
 struct integer_divide_ceiler
 {
    __host__ __device__ constexpr T operator()(T a, T b) const
    {
        static_assert(is_same<T, index_t>{} || is_same<T, int>{}, "wrong type");

-        return (a + b - 1) / b;
+        return (a + b - Number<1>{}) / b;
    }
 };

-template <class X, class Y>
+template <typename X, typename Y>
 __host__ __device__ constexpr auto integer_divide_floor(X x, Y y)
 {
    return x / y;
 }

-template <class X, class Y>
+template <typename X, typename Y>
 __host__ __device__ constexpr auto integer_divide_ceil(X x, Y y)
 {
    return (x + y - Number<1>{}) / y;
 }

-template <class X, class Y>
+template <typename X, typename Y>
 __host__ __device__ constexpr auto integer_least_multiple(X x, Y y)
 {
    return y * integer_divide_ceil(x, y);
 }

-template <class T>
+template <typename T>
 __host__ __device__ constexpr T max(T x)
 {
    return x;
 }

-template <class T, class... Ts>
-__host__ __device__ constexpr T max(T x, Ts... xs)
+template <typename T>
+__host__ __device__ constexpr T max(T x, T y)
 {
-    static_assert(sizeof...(xs) > 0, "not enough argument");
-
-    auto y = max(xs...);
-
-    static_assert(is_same<decltype(y), T>{}, "not the same type");
-
    return x > y ? x : y;
 }

-template <class T>
+template <index_t X>
+__host__ __device__ constexpr index_t max(Number<X>, index_t y)
+{
+    return X > y ? X : y;
+}
+
+template <index_t Y>
+__host__ __device__ constexpr index_t max(index_t x, Number<Y>)
+{
+    return x > Y ? x : Y;
+}
+
+template <typename X, typename... Ys>
+__host__ __device__ constexpr auto max(X x, Ys... ys)
+{
+    static_assert(sizeof...(Ys) > 0, "not enough argument");
+
+    return max(x, max(ys...));
+}
+
+template <typename T>
 __host__ __device__ constexpr T min(T x)
 {
    return x;
 }

-template <class T, class... Ts>
-__host__ __device__ constexpr T min(T x, Ts... xs)
+template <typename T>
+__host__ __device__ constexpr T min(T x, T y)
 {
-    static_assert(sizeof...(xs) > 0, "not enough argument");
-
-    auto y = min(xs...);
-
-    static_assert(is_same<decltype(y), T>{}, "not the same type");
-
    return x < y ? x : y;
 }

+template <index_t X>
+__host__ __device__ constexpr index_t min(Number<X>, index_t y)
+{
+    return X < y ? X : y;
+}
+
+template <index_t Y>
+__host__ __device__ constexpr index_t min(index_t x, Number<Y>)
+{
+    return x < Y ? x : Y;
+}
+
+template <typename X, typename... Ys>
+__host__ __device__ constexpr auto min(X x, Ys... ys)
+{
+    static_assert(sizeof...(Ys) > 0, "not enough argument");
+
+    return min(x, min(ys...));
+}
+
 // greatest common divisor, aka highest common factor
 __host__ __device__ constexpr index_t gcd(index_t x, index_t y)
 {
@@ -171,13 +199,13 @@ __host__ __device__ constexpr auto lcm(X x, Ys... ys)
    return lcm(x, lcm(ys...));
 }

-template <class T>
+template <typename T>
 struct equal
 {
    __host__ __device__ constexpr bool operator()(T x, T y) const { return x == y; }
 };

-template <class T>
+template <typename T>
 struct less
 {
    __host__ __device__ constexpr bool operator()(T x, T y) const { return x < y; }