diff --git a/CMakeLists.txt b/CMakeLists.txt index 55727cc7c9..9798220caa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -33,13 +33,9 @@ endif( NOT( ${CMAKE_CXX_COMPILER_ID} STREQUAL "AppleClang") ) #GPU backend if(DEVICE_BACKEND STREQUAL "AMD") - set(CK_DEVICE_BACKEND_AMD 1) - set(CMAKE_MODULE_PATH "/opt/rocm/hip/cmake" ${CMAKE_MODULE_PATH}) find_package(HIP REQUIRED) elseif(DEVICE_BACKEND STREQUAL "NVIDIA") - set(CK_DEVICE_BACKEND_NVIDIA 1) - enable_language(CUDA) include_directories(BEFORE ${CUDA_COMMON_INCLUDE_DIR}) endif() diff --git a/composable_kernel/include/tensor_operation/blockwise_gemm.hpp b/composable_kernel/include/tensor_operation/blockwise_gemm.hpp index 2052747c65..923e8c7209 100644 --- a/composable_kernel/include/tensor_operation/blockwise_gemm.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_gemm.hpp @@ -54,9 +54,9 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2 N % (NPerThreadSubC * NLevel0Cluster * NLevel1Cluster) == 0, "wrong! Cannot evenly divide work among\n"); - static_assert(std::is_same{}, - "wrong! ThreadMatrixC lengths is wrong"); + static_assert( + is_same{}, + "wrong! ThreadMatrixC lengths is wrong"); auto c_thread_mtx_index = GetBeginOfThreadMatrixC(get_thread_local_1d_id()); diff --git a/composable_kernel/include/utility/amd_inline_asm.hpp b/composable_kernel/include/utility/amd_inline_asm.hpp index f973a41b0d..e7e20808e1 100644 --- a/composable_kernel/include/utility/amd_inline_asm.hpp +++ b/composable_kernel/include/utility/amd_inline_asm.hpp @@ -6,7 +6,7 @@ namespace ck { // cast a pointer of LDS to its address -extern "C" __attribute__((address_space(3))) void* __to_local(void* p)[[hc]]; +extern "C" __attribute__((address_space(3))) __device__ void* __to_local(void* p); __device__ void vmcnt(index_t cnt) { diff --git a/composable_kernel/include/utility/common_header.hpp b/composable_kernel/include/utility/common_header.hpp index 1c8dcbd521..339666ff91 100644 --- a/composable_kernel/include/utility/common_header.hpp +++ b/composable_kernel/include/utility/common_header.hpp @@ -1,18 +1,15 @@ -#ifndef CK_COMMON_HPP -#define CK_COMMON_HPP +#ifndef CK_COMMON_HEADER_HPP +#define CK_COMMON_HEADER_HPP #include "config.hpp" +#include "integral_constant.hpp" +#include "math.hpp" #include "utility.hpp" #include "vector_type.hpp" -#include "integral_constant.hpp" #include "Sequence.hpp" #include "Array.hpp" #include "functional.hpp" #include "functional2.hpp" #include "functional3.hpp" -#if CK_USE_AMD_INLINE_ASM -#include "amd_inline_asm.hpp" -#endif - #endif diff --git a/composable_kernel/include/utility/config_amd.hpp.in b/composable_kernel/include/utility/config_amd.hpp.in index ea8a3356d6..f9ae2e7830 100644 --- a/composable_kernel/include/utility/config_amd.hpp.in +++ b/composable_kernel/include/utility/config_amd.hpp.in @@ -1,12 +1,11 @@ #ifndef CK_CONFIG_AMD_HPP #define CK_CONFIG_AMD_HPP -#cmakedefine01 CK_DEVICE_BACKEND_AMD - #include "hip/hip_runtime.h" #include "hip/hip_fp16.h" -#define CK_USE_AMD_INLINE_ASM 1 +#define CK_DEVICE_BACKEND_AMD 1 +#define CK_USE_AMD_INLINE_ASM 1 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 0 diff --git a/composable_kernel/include/utility/config_nvidia.hpp.in b/composable_kernel/include/utility/config_nvidia.hpp.in index 038448e9d1..f077db991c 100644 --- a/composable_kernel/include/utility/config_nvidia.hpp.in +++ b/composable_kernel/include/utility/config_nvidia.hpp.in @@ -1,14 +1,13 @@ #ifndef CK_CONFIG_NVIDIA_HPP #define CK_CONFIG_NVIDIA_HPP -#cmakedefine01 CK_DEVICE_BACKEND_NVIDIA - #include "cuda_runtime.h" #include "cuda_fp16.h" #include "nvToolsExt.h" #include "helper_cuda.h" -#define CK_USE_AMD_INLINE_ASM 0 +#define CK_DEVICE_BACKEND_NVIDIA 1 +#define CK_USE_AMD_INLINE_ASM 0 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1 0 @@ -23,10 +22,6 @@ using float4_t = float4; using index_t = uint32_t; -__device__ index_t get_thread_local_1d_id() { return threadIdx.x; } - -__device__ index_t get_block_1d_id() { return blockIdx.x; } - __device__ void fused_multiply_accumulate(float& d, const float& s0, const float& s1) { d += s0 * s1; @@ -52,9 +47,7 @@ __device__ void fused_multiply_accumulate(char& d, const char& s0, const char& s // need to make a better interface __device__ void fused_multiply_accumulate(int32_t& d, const int32_t& s0, const int32_t& s1) { -#if CK_DEVICE_BACKEND_NVIDIA d = __dp4a(s0, s1, d); -#endif } #endif diff --git a/composable_kernel/include/utility/integral_constant.hpp b/composable_kernel/include/utility/integral_constant.hpp index 6038b1b203..b83d7843d5 100644 --- a/composable_kernel/include/utility/integral_constant.hpp +++ b/composable_kernel/include/utility/integral_constant.hpp @@ -1,12 +1,20 @@ #ifndef CK_INTEGRAL_CONSTANT_HPP #define CK_INTEGRAL_CONSTANT_HPP -#include - namespace ck { template -using integral_constant = std::integral_constant; +struct integral_constant +{ + static constexpr T value = v; + typedef T value_type; + typedef integral_constant type; // using injected-class-name + __host__ __device__ constexpr operator value_type() const noexcept { return value; } + __host__ __device__ constexpr value_type operator()() const noexcept + { + return value; + } // since c++14 +}; template __host__ __device__ constexpr auto operator+(integral_constant, integral_constant) @@ -23,5 +31,15 @@ __host__ __device__ constexpr auto operator*(integral_constant, integral_c template using Number = integral_constant; +template +struct is_same : public integral_constant +{ +}; + +template +struct is_same : public integral_constant +{ +}; + } // namespace ck #endif diff --git a/composable_kernel/include/utility/math.hpp b/composable_kernel/include/utility/math.hpp new file mode 100644 index 0000000000..b754ca173c --- /dev/null +++ b/composable_kernel/include/utility/math.hpp @@ -0,0 +1,107 @@ +#ifndef CK_MATH_HPP +#define CK_MATH_HPP + +#include "config.hpp" + +namespace ck { +namespace math { + +template +struct scales +{ + __host__ __device__ constexpr T operator()(T a) const { return s * a; } +}; + +template +struct plus +{ + __host__ __device__ constexpr T operator()(T a, T b) const { return a + b; } +}; + +template +struct minus +{ + __host__ __device__ constexpr T operator()(T a, T b) const { return a - b; } +}; + +template +struct multiplies +{ + __host__ __device__ constexpr T operator()(T a, T b) const { return a * b; } +}; + +template +struct integer_divide_ceiler +{ + __host__ __device__ constexpr T operator()(T a, T b) const + { + static_assert(is_same{} || is_same{}, "wrong type"); + + return (a + b - 1) / b; + } +}; + +template +__host__ __device__ constexpr T integer_divide_ceil(T a, T b) +{ + static_assert(is_same{} || is_same{}, "wrong type"); + + return (a + b - 1) / b; +} + +template +__host__ __device__ constexpr T integer_least_multiple(T a, T b) +{ + static_assert(is_same{} || is_same{}, "wrong type"); + + return b * integer_divide_ceil(a, b); +} + +template +__host__ __device__ constexpr T max(T x) +{ + return x; +} + +template +__host__ __device__ constexpr T max(T x, Ts... xs) +{ + static_assert(sizeof...(xs) > 0, "not enough argument"); + + auto y = max(xs...); + + static_assert(is_same{}, "not the same type"); + + return x > y ? x : y; +} + +template +__host__ __device__ constexpr T min(T x) +{ + return x; +} + +template +__host__ __device__ constexpr T min(T x, Ts... xs) +{ + static_assert(sizeof...(xs) > 0, "not enough argument"); + + auto y = min(xs...); + + static_assert(is_same{}, "not the same type"); + + return x < y ? x : y; +} + +// this is WRONG +// TODO: implement least common multiple properly, instead of calling max() +template +__host__ __device__ constexpr T lcm(T x, Ts... xs) +{ + return max(x, xs...); +} + +} // namespace math +} // namspace ck + +#endif diff --git a/composable_kernel/include/utility/utility.hpp b/composable_kernel/include/utility/utility.hpp index e873881561..4cb2daa7c8 100644 --- a/composable_kernel/include/utility/utility.hpp +++ b/composable_kernel/include/utility/utility.hpp @@ -1,112 +1,14 @@ #ifndef CK_UTILITY_HPP #define CK_UTILITY_HPP -#include #include "config.hpp" namespace ck { -template -using is_same = std::is_same; +__device__ index_t get_thread_local_1d_id() { return threadIdx.x; } -namespace math { +__device__ index_t get_block_1d_id() { return blockIdx.x; } -template -struct scales -{ - __host__ __device__ constexpr T operator()(T a) const { return s * a; } -}; - -template -struct plus -{ - __host__ __device__ constexpr T operator()(T a, T b) const { return a + b; } -}; - -template -struct minus -{ - __host__ __device__ constexpr T operator()(T a, T b) const { return a - b; } -}; - -template -struct multiplies -{ - __host__ __device__ constexpr T operator()(T a, T b) const { return a * b; } -}; - -template -struct integer_divide_ceiler -{ - __host__ __device__ constexpr T operator()(T a, T b) const - { - static_assert(is_same{} || is_same{}, "wrong type"); - - return (a + b - 1) / b; - } -}; - -template -__host__ __device__ constexpr T integer_divide_ceil(T a, T b) -{ - static_assert(is_same{} || is_same{}, "wrong type"); - - return (a + b - 1) / b; -} - -template -__host__ __device__ constexpr T integer_least_multiple(T a, T b) -{ - static_assert(is_same{} || is_same{}, "wrong type"); - - return b * integer_divide_ceil(a, b); -} - -template -__host__ __device__ constexpr T max(T x) -{ - return x; -} - -template -__host__ __device__ constexpr T max(T x, Ts... xs) -{ - static_assert(sizeof...(xs) > 0, "not enough argument"); - - auto y = max(xs...); - - static_assert(is_same{}, "not the same type"); - - return x > y ? x : y; -} - -template -__host__ __device__ constexpr T min(T x) -{ - return x; -} - -template -__host__ __device__ constexpr T min(T x, Ts... xs) -{ - static_assert(sizeof...(xs) > 0, "not enough argument"); - - auto y = min(xs...); - - static_assert(is_same{}, "not the same type"); - - return x < y ? x : y; -} - -// this is WRONG -// TODO: implement least common multiple properly, instead of calling max() -template -__host__ __device__ constexpr T lcm(T x, Ts... xs) -{ - return max(x, xs...); -} - -} // namespace math } // namspace ck #endif