mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-11 17:00:18 +00:00
refactor
This commit is contained in:
@@ -54,9 +54,9 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
|
||||
N % (NPerThreadSubC * NLevel0Cluster * NLevel1Cluster) == 0,
|
||||
"wrong! Cannot evenly divide work among\n");
|
||||
|
||||
static_assert(std::is_same<decltype(ThreadMatrixC::GetLengths()),
|
||||
decltype(GetThreadMatrixCLengths())>{},
|
||||
"wrong! ThreadMatrixC lengths is wrong");
|
||||
static_assert(
|
||||
is_same<decltype(ThreadMatrixC::GetLengths()), decltype(GetThreadMatrixCLengths())>{},
|
||||
"wrong! ThreadMatrixC lengths is wrong");
|
||||
|
||||
auto c_thread_mtx_index = GetBeginOfThreadMatrixC(get_thread_local_1d_id());
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
namespace ck {
|
||||
|
||||
// cast a pointer of LDS to its address
|
||||
extern "C" __attribute__((address_space(3))) void* __to_local(void* p)[[hc]];
|
||||
extern "C" __attribute__((address_space(3))) __device__ void* __to_local(void* p);
|
||||
|
||||
__device__ void vmcnt(index_t cnt)
|
||||
{
|
||||
|
||||
@@ -1,18 +1,15 @@
|
||||
#ifndef CK_COMMON_HPP
|
||||
#define CK_COMMON_HPP
|
||||
#ifndef CK_COMMON_HEADER_HPP
|
||||
#define CK_COMMON_HEADER_HPP
|
||||
|
||||
#include "config.hpp"
|
||||
#include "integral_constant.hpp"
|
||||
#include "math.hpp"
|
||||
#include "utility.hpp"
|
||||
#include "vector_type.hpp"
|
||||
#include "integral_constant.hpp"
|
||||
#include "Sequence.hpp"
|
||||
#include "Array.hpp"
|
||||
#include "functional.hpp"
|
||||
#include "functional2.hpp"
|
||||
#include "functional3.hpp"
|
||||
|
||||
#if CK_USE_AMD_INLINE_ASM
|
||||
#include "amd_inline_asm.hpp"
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
@@ -1,12 +1,11 @@
|
||||
#ifndef CK_CONFIG_AMD_HPP
|
||||
#define CK_CONFIG_AMD_HPP
|
||||
|
||||
#cmakedefine01 CK_DEVICE_BACKEND_AMD
|
||||
|
||||
#include "hip/hip_runtime.h"
|
||||
#include "hip/hip_fp16.h"
|
||||
#define CK_USE_AMD_INLINE_ASM 1
|
||||
|
||||
#define CK_DEVICE_BACKEND_AMD 1
|
||||
#define CK_USE_AMD_INLINE_ASM 1
|
||||
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1
|
||||
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 0
|
||||
|
||||
|
||||
@@ -1,14 +1,13 @@
|
||||
#ifndef CK_CONFIG_NVIDIA_HPP
|
||||
#define CK_CONFIG_NVIDIA_HPP
|
||||
|
||||
#cmakedefine01 CK_DEVICE_BACKEND_NVIDIA
|
||||
|
||||
#include "cuda_runtime.h"
|
||||
#include "cuda_fp16.h"
|
||||
#include "nvToolsExt.h"
|
||||
#include "helper_cuda.h"
|
||||
#define CK_USE_AMD_INLINE_ASM 0
|
||||
|
||||
#define CK_DEVICE_BACKEND_NVIDIA 1
|
||||
#define CK_USE_AMD_INLINE_ASM 0
|
||||
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0
|
||||
#define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 0
|
||||
|
||||
@@ -23,10 +22,6 @@ using float4_t = float4;
|
||||
|
||||
using index_t = uint32_t;
|
||||
|
||||
__device__ index_t get_thread_local_1d_id() { return threadIdx.x; }
|
||||
|
||||
__device__ index_t get_block_1d_id() { return blockIdx.x; }
|
||||
|
||||
__device__ void fused_multiply_accumulate(float& d, const float& s0, const float& s1)
|
||||
{
|
||||
d += s0 * s1;
|
||||
@@ -52,9 +47,7 @@ __device__ void fused_multiply_accumulate(char& d, const char& s0, const char& s
|
||||
// need to make a better interface
|
||||
__device__ void fused_multiply_accumulate(int32_t& d, const int32_t& s0, const int32_t& s1)
|
||||
{
|
||||
#if CK_DEVICE_BACKEND_NVIDIA
|
||||
d = __dp4a(s0, s1, d);
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
@@ -1,12 +1,20 @@
|
||||
#ifndef CK_INTEGRAL_CONSTANT_HPP
|
||||
#define CK_INTEGRAL_CONSTANT_HPP
|
||||
|
||||
#include <type_traits>
|
||||
|
||||
namespace ck {
|
||||
|
||||
template <class T, T v>
|
||||
using integral_constant = std::integral_constant<T, v>;
|
||||
struct integral_constant
|
||||
{
|
||||
static constexpr T value = v;
|
||||
typedef T value_type;
|
||||
typedef integral_constant type; // using injected-class-name
|
||||
__host__ __device__ constexpr operator value_type() const noexcept { return value; }
|
||||
__host__ __device__ constexpr value_type operator()() const noexcept
|
||||
{
|
||||
return value;
|
||||
} // since c++14
|
||||
};
|
||||
|
||||
template <class T, T X, T Y>
|
||||
__host__ __device__ constexpr auto operator+(integral_constant<T, X>, integral_constant<T, Y>)
|
||||
@@ -23,5 +31,15 @@ __host__ __device__ constexpr auto operator*(integral_constant<T, X>, integral_c
|
||||
template <index_t N>
|
||||
using Number = integral_constant<index_t, N>;
|
||||
|
||||
template <class X, class Y>
|
||||
struct is_same : public integral_constant<bool, false>
|
||||
{
|
||||
};
|
||||
|
||||
template <class X>
|
||||
struct is_same<X, X> : public integral_constant<bool, true>
|
||||
{
|
||||
};
|
||||
|
||||
} // namespace ck
|
||||
#endif
|
||||
|
||||
107
composable_kernel/include/utility/math.hpp
Normal file
107
composable_kernel/include/utility/math.hpp
Normal file
@@ -0,0 +1,107 @@
|
||||
#ifndef CK_MATH_HPP
|
||||
#define CK_MATH_HPP
|
||||
|
||||
#include "config.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace math {
|
||||
|
||||
template <class T, T s>
|
||||
struct scales
|
||||
{
|
||||
__host__ __device__ constexpr T operator()(T a) const { return s * a; }
|
||||
};
|
||||
|
||||
template <class T>
|
||||
struct plus
|
||||
{
|
||||
__host__ __device__ constexpr T operator()(T a, T b) const { return a + b; }
|
||||
};
|
||||
|
||||
template <class T>
|
||||
struct minus
|
||||
{
|
||||
__host__ __device__ constexpr T operator()(T a, T b) const { return a - b; }
|
||||
};
|
||||
|
||||
template <class T>
|
||||
struct multiplies
|
||||
{
|
||||
__host__ __device__ constexpr T operator()(T a, T b) const { return a * b; }
|
||||
};
|
||||
|
||||
template <class T>
|
||||
struct integer_divide_ceiler
|
||||
{
|
||||
__host__ __device__ constexpr T operator()(T a, T b) const
|
||||
{
|
||||
static_assert(is_same<T, index_t>{} || is_same<T, int>{}, "wrong type");
|
||||
|
||||
return (a + b - 1) / b;
|
||||
}
|
||||
};
|
||||
|
||||
template <class T>
|
||||
__host__ __device__ constexpr T integer_divide_ceil(T a, T b)
|
||||
{
|
||||
static_assert(is_same<T, index_t>{} || is_same<T, int>{}, "wrong type");
|
||||
|
||||
return (a + b - 1) / b;
|
||||
}
|
||||
|
||||
template <class T>
|
||||
__host__ __device__ constexpr T integer_least_multiple(T a, T b)
|
||||
{
|
||||
static_assert(is_same<T, index_t>{} || is_same<T, int>{}, "wrong type");
|
||||
|
||||
return b * integer_divide_ceil(a, b);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
__host__ __device__ constexpr T max(T x)
|
||||
{
|
||||
return x;
|
||||
}
|
||||
|
||||
template <class T, class... Ts>
|
||||
__host__ __device__ constexpr T max(T x, Ts... xs)
|
||||
{
|
||||
static_assert(sizeof...(xs) > 0, "not enough argument");
|
||||
|
||||
auto y = max(xs...);
|
||||
|
||||
static_assert(is_same<decltype(y), T>{}, "not the same type");
|
||||
|
||||
return x > y ? x : y;
|
||||
}
|
||||
|
||||
template <class T>
|
||||
__host__ __device__ constexpr T min(T x)
|
||||
{
|
||||
return x;
|
||||
}
|
||||
|
||||
template <class T, class... Ts>
|
||||
__host__ __device__ constexpr T min(T x, Ts... xs)
|
||||
{
|
||||
static_assert(sizeof...(xs) > 0, "not enough argument");
|
||||
|
||||
auto y = min(xs...);
|
||||
|
||||
static_assert(is_same<decltype(y), T>{}, "not the same type");
|
||||
|
||||
return x < y ? x : y;
|
||||
}
|
||||
|
||||
// this is WRONG
|
||||
// TODO: implement least common multiple properly, instead of calling max()
|
||||
template <class T, class... Ts>
|
||||
__host__ __device__ constexpr T lcm(T x, Ts... xs)
|
||||
{
|
||||
return max(x, xs...);
|
||||
}
|
||||
|
||||
} // namespace math
|
||||
} // namspace ck
|
||||
|
||||
#endif
|
||||
@@ -1,112 +1,14 @@
|
||||
#ifndef CK_UTILITY_HPP
|
||||
#define CK_UTILITY_HPP
|
||||
|
||||
#include <type_traits>
|
||||
#include "config.hpp"
|
||||
|
||||
namespace ck {
|
||||
|
||||
template <class X, class Y>
|
||||
using is_same = std::is_same<X, Y>;
|
||||
__device__ index_t get_thread_local_1d_id() { return threadIdx.x; }
|
||||
|
||||
namespace math {
|
||||
__device__ index_t get_block_1d_id() { return blockIdx.x; }
|
||||
|
||||
template <class T, T s>
|
||||
struct scales
|
||||
{
|
||||
__host__ __device__ constexpr T operator()(T a) const { return s * a; }
|
||||
};
|
||||
|
||||
template <class T>
|
||||
struct plus
|
||||
{
|
||||
__host__ __device__ constexpr T operator()(T a, T b) const { return a + b; }
|
||||
};
|
||||
|
||||
template <class T>
|
||||
struct minus
|
||||
{
|
||||
__host__ __device__ constexpr T operator()(T a, T b) const { return a - b; }
|
||||
};
|
||||
|
||||
template <class T>
|
||||
struct multiplies
|
||||
{
|
||||
__host__ __device__ constexpr T operator()(T a, T b) const { return a * b; }
|
||||
};
|
||||
|
||||
template <class T>
|
||||
struct integer_divide_ceiler
|
||||
{
|
||||
__host__ __device__ constexpr T operator()(T a, T b) const
|
||||
{
|
||||
static_assert(is_same<T, index_t>{} || is_same<T, int>{}, "wrong type");
|
||||
|
||||
return (a + b - 1) / b;
|
||||
}
|
||||
};
|
||||
|
||||
template <class T>
|
||||
__host__ __device__ constexpr T integer_divide_ceil(T a, T b)
|
||||
{
|
||||
static_assert(is_same<T, index_t>{} || is_same<T, int>{}, "wrong type");
|
||||
|
||||
return (a + b - 1) / b;
|
||||
}
|
||||
|
||||
template <class T>
|
||||
__host__ __device__ constexpr T integer_least_multiple(T a, T b)
|
||||
{
|
||||
static_assert(is_same<T, index_t>{} || is_same<T, int>{}, "wrong type");
|
||||
|
||||
return b * integer_divide_ceil(a, b);
|
||||
}
|
||||
|
||||
template <class T>
|
||||
__host__ __device__ constexpr T max(T x)
|
||||
{
|
||||
return x;
|
||||
}
|
||||
|
||||
template <class T, class... Ts>
|
||||
__host__ __device__ constexpr T max(T x, Ts... xs)
|
||||
{
|
||||
static_assert(sizeof...(xs) > 0, "not enough argument");
|
||||
|
||||
auto y = max(xs...);
|
||||
|
||||
static_assert(is_same<decltype(y), T>{}, "not the same type");
|
||||
|
||||
return x > y ? x : y;
|
||||
}
|
||||
|
||||
template <class T>
|
||||
__host__ __device__ constexpr T min(T x)
|
||||
{
|
||||
return x;
|
||||
}
|
||||
|
||||
template <class T, class... Ts>
|
||||
__host__ __device__ constexpr T min(T x, Ts... xs)
|
||||
{
|
||||
static_assert(sizeof...(xs) > 0, "not enough argument");
|
||||
|
||||
auto y = min(xs...);
|
||||
|
||||
static_assert(is_same<decltype(y), T>{}, "not the same type");
|
||||
|
||||
return x < y ? x : y;
|
||||
}
|
||||
|
||||
// this is WRONG
|
||||
// TODO: implement least common multiple properly, instead of calling max()
|
||||
template <class T, class... Ts>
|
||||
__host__ __device__ constexpr T lcm(T x, Ts... xs)
|
||||
{
|
||||
return max(x, xs...);
|
||||
}
|
||||
|
||||
} // namespace math
|
||||
} // namspace ck
|
||||
|
||||
#endif
|
||||
|
||||
Reference in New Issue
Block a user