mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-17 11:30:02 +00:00
xdlops_v4r4_fwd fp32/fp16 (#34)
* create files for xdlops
* working on blockwise_gemm_xdlops
* add KReduction
* add m/n repeats
* add 2x2 pipeline
* added 128x128 wavegemm
* use StaticBuffer of vector_type
* break vector type to blk_size
* add kpack into xdlops_gemm and blockwise_gemm
* abroadcast only
* add fp32 mfma instructions
* adding fp16 mfma
* pack half4_t
* rename kperwave to kpack
* add 32x32x8fp16
* add fp16 mfma
* clean code
* clean code
* V4r4 xdlops kpack (#35)
* add kpack with incorrect results
* bug fix for make_dynamic_naive_tensor_descriptor_aligned_v2
* add 1x1 kernel
* add gridwise_gemm_v2 - single_buffer
* enabled dwordx4 for fp16
Co-authored-by: Chao Liu <chao.liu2@amd.com>
* refactor fwd-v4r4-xdlops
* add v4r4-nhwc-xdlops
* improve some perf of nhwc and nchw by tuning parameters, and change scheduling in gridwise-gemm loop
* tweak scheduling in gridwise gemm
* add v4r3 with a single output copy
* init commit: output with slice win
* adding sliceWin
* add multiple repeats pattern
* starting adding bwd-v4r1-xdlops
* use tuple as SrcBuffer
* adding bwd-data v4r1 nhwc xdlops
* fix bug in make_dynamic_naive_tensor_descriptor_aligned_v2()
* fix bug in host bwd-data conv
* initial implementation of bwd-data v4r1 nhwc xdlops
* add launch bound flags
* enable launch bound
* add m/nrepeat=4
* tweak bwd-data v4r1 nhwc xdlops
* added bwd-data v4r1 nhwc xdlops with output A and weight B
* add fwd-v4r4 nhwc xdlops, A input, B weight, C output
Co-authored-by: Chao Liu <chao.liu2@amd.com>
[ROCm/composable_kernel commit: 3835318cc3]
This commit is contained in:
@@ -9,25 +9,25 @@
|
||||
namespace ck {
|
||||
namespace math {
|
||||
|
||||
template <class T, T s>
|
||||
template <typename T, T s>
|
||||
struct scales
|
||||
{
|
||||
__host__ __device__ constexpr T operator()(T a) const { return s * a; }
|
||||
};
|
||||
|
||||
template <class T>
|
||||
template <typename T>
|
||||
struct plus
|
||||
{
|
||||
__host__ __device__ constexpr T operator()(T a, T b) const { return a + b; }
|
||||
};
|
||||
|
||||
template <class T>
|
||||
template <typename T>
|
||||
struct minus
|
||||
{
|
||||
__host__ __device__ constexpr T operator()(T a, T b) const { return a - b; }
|
||||
};
|
||||
|
||||
template <class T>
|
||||
template <typename T>
|
||||
struct multiplies
|
||||
{
|
||||
__host__ __device__ constexpr T operator()(T a, T b) const { return a * b; }
|
||||
@@ -42,83 +42,111 @@ struct multiplies_v2
|
||||
}
|
||||
};
|
||||
|
||||
template <class T>
|
||||
template <typename T>
|
||||
struct maximize
|
||||
{
|
||||
__host__ __device__ constexpr T operator()(T a, T b) const { return a >= b ? a : b; }
|
||||
};
|
||||
|
||||
template <class T>
|
||||
template <typename T>
|
||||
struct minimize
|
||||
{
|
||||
__host__ __device__ constexpr T operator()(T a, T b) const { return a <= b ? a : b; }
|
||||
};
|
||||
|
||||
template <class T>
|
||||
template <typename T>
|
||||
struct integer_divide_ceiler
|
||||
{
|
||||
__host__ __device__ constexpr T operator()(T a, T b) const
|
||||
{
|
||||
static_assert(is_same<T, index_t>{} || is_same<T, int>{}, "wrong type");
|
||||
|
||||
return (a + b - 1) / b;
|
||||
return (a + b - Number<1>{}) / b;
|
||||
}
|
||||
};
|
||||
|
||||
template <class X, class Y>
|
||||
template <typename X, typename Y>
|
||||
__host__ __device__ constexpr auto integer_divide_floor(X x, Y y)
|
||||
{
|
||||
return x / y;
|
||||
}
|
||||
|
||||
template <class X, class Y>
|
||||
template <typename X, typename Y>
|
||||
__host__ __device__ constexpr auto integer_divide_ceil(X x, Y y)
|
||||
{
|
||||
return (x + y - Number<1>{}) / y;
|
||||
}
|
||||
|
||||
template <class X, class Y>
|
||||
template <typename X, typename Y>
|
||||
__host__ __device__ constexpr auto integer_least_multiple(X x, Y y)
|
||||
{
|
||||
return y * integer_divide_ceil(x, y);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
template <typename T>
|
||||
__host__ __device__ constexpr T max(T x)
|
||||
{
|
||||
return x;
|
||||
}
|
||||
|
||||
template <class T, class... Ts>
|
||||
__host__ __device__ constexpr T max(T x, Ts... xs)
|
||||
template <typename T>
|
||||
__host__ __device__ constexpr T max(T x, T y)
|
||||
{
|
||||
static_assert(sizeof...(xs) > 0, "not enough argument");
|
||||
|
||||
auto y = max(xs...);
|
||||
|
||||
static_assert(is_same<decltype(y), T>{}, "not the same type");
|
||||
|
||||
return x > y ? x : y;
|
||||
}
|
||||
|
||||
template <class T>
|
||||
template <index_t X>
|
||||
__host__ __device__ constexpr index_t max(Number<X>, index_t y)
|
||||
{
|
||||
return X > y ? X : y;
|
||||
}
|
||||
|
||||
template <index_t Y>
|
||||
__host__ __device__ constexpr index_t max(index_t x, Number<Y>)
|
||||
{
|
||||
return x > Y ? x : Y;
|
||||
}
|
||||
|
||||
template <typename X, typename... Ys>
|
||||
__host__ __device__ constexpr auto max(X x, Ys... ys)
|
||||
{
|
||||
static_assert(sizeof...(Ys) > 0, "not enough argument");
|
||||
|
||||
return max(x, max(ys...));
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
__host__ __device__ constexpr T min(T x)
|
||||
{
|
||||
return x;
|
||||
}
|
||||
|
||||
template <class T, class... Ts>
|
||||
__host__ __device__ constexpr T min(T x, Ts... xs)
|
||||
template <typename T>
|
||||
__host__ __device__ constexpr T min(T x, T y)
|
||||
{
|
||||
static_assert(sizeof...(xs) > 0, "not enough argument");
|
||||
|
||||
auto y = min(xs...);
|
||||
|
||||
static_assert(is_same<decltype(y), T>{}, "not the same type");
|
||||
|
||||
return x < y ? x : y;
|
||||
}
|
||||
|
||||
template <index_t X>
|
||||
__host__ __device__ constexpr index_t min(Number<X>, index_t y)
|
||||
{
|
||||
return X < y ? X : y;
|
||||
}
|
||||
|
||||
template <index_t Y>
|
||||
__host__ __device__ constexpr index_t min(index_t x, Number<Y>)
|
||||
{
|
||||
return x < Y ? x : Y;
|
||||
}
|
||||
|
||||
template <typename X, typename... Ys>
|
||||
__host__ __device__ constexpr auto min(X x, Ys... ys)
|
||||
{
|
||||
static_assert(sizeof...(Ys) > 0, "not enough argument");
|
||||
|
||||
return min(x, min(ys...));
|
||||
}
|
||||
|
||||
// greatest common divisor, aka highest common factor
|
||||
__host__ __device__ constexpr index_t gcd(index_t x, index_t y)
|
||||
{
|
||||
@@ -171,13 +199,13 @@ __host__ __device__ constexpr auto lcm(X x, Ys... ys)
|
||||
return lcm(x, lcm(ys...));
|
||||
}
|
||||
|
||||
template <class T>
|
||||
template <typename T>
|
||||
struct equal
|
||||
{
|
||||
__host__ __device__ constexpr bool operator()(T x, T y) const { return x == y; }
|
||||
};
|
||||
|
||||
template <class T>
|
||||
template <typename T>
|
||||
struct less
|
||||
{
|
||||
__host__ __device__ constexpr bool operator()(T x, T y) const { return x < y; }
|
||||
|
||||
Reference in New Issue
Block a user