diff --git a/src/include/ConstantMatrixDescriptor.hip.hpp b/src/include/ConstantMatrixDescriptor.hip.hpp
index 9cacf27553..c6ca3192e3 100644
--- a/src/include/ConstantMatrixDescriptor.hip.hpp
+++ b/src/include/ConstantMatrixDescriptor.hip.hpp
@@ -23,11 +23,7 @@ struct ConstantMatrixDescriptor
 
     __host__ __device__ index_t Get1dIndex(index_t irow, index_t icol) const
     {
-#if DEVICE_BACKEND_HIP
-        return __mul24(irow, RowStride_) + icol;
-#else
         return irow * RowStride_ + icol;
-#endif
     }
 
     template
diff --git a/src/include/blockwise_2d_tensor_op.hip.hpp b/src/include/blockwise_2d_tensor_op.hip.hpp
index 8d457056a2..a5c8f1ea9b 100644
--- a/src/include/blockwise_2d_tensor_op.hip.hpp
+++ b/src/include/blockwise_2d_tensor_op.hip.hpp
@@ -668,7 +668,7 @@ struct Blockwise2dTensorCopy3
         }
     }
 
-#if DEVICE_BACKEND_HIP
+#if USE_AMD_INLINE_ASM
     __device__ void RunLoadRegisterClipboard_asm(const Float* __restrict__ p_src,
                                                  Float* p_clipboard) const
     {
diff --git a/src/include/blockwise_batched_gemm.hip.hpp b/src/include/blockwise_batched_gemm.hip.hpp
index b50acfe0d4..98f36011f3 100644
--- a/src/include/blockwise_batched_gemm.hip.hpp
+++ b/src/include/blockwise_batched_gemm.hip.hpp
@@ -283,7 +283,7 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
         }
     }
 
-#if DEVICE_BACKEND_HIP
+#if USE_AMD_INLINE_ASM
     template <class FloatA, class FloatB, class FloatC>
     __device__ void Run_asm(const FloatA* __restrict__ p_a_block,
                             const FloatB* __restrict__ p_b_block,
diff --git a/src/include/blockwise_gemm.hip.hpp b/src/include/blockwise_gemm.hip.hpp
index 4870227f7e..582f45f7ea 100644
--- a/src/include/blockwise_gemm.hip.hpp
+++ b/src/include/blockwise_gemm.hip.hpp
@@ -126,7 +126,7 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
             n_repeat * NPerLevel1Cluster + n_in_sub_c};
     }
 
-#if DEVICE_BACKEND_HIP
+#if USE_AMD_INLINE_ASM
     // TODO: this is not working correctly
     template <class FloatA, class FloatB, class FloatC>
    __device__ void Run_asm(const FloatA* __restrict__ p_a_block,
diff --git a/src/include/common.hip.hpp b/src/include/common.hip.hpp
index 5f99872d8e..bf7249ca70 100644
--- a/src/include/common.hip.hpp
+++ b/src/include/common.hip.hpp
@@ -1,12 +1,12 @@
 #pragma once
 
-#include "data_type.hip.hpp"
+#include "vector_type.hip.hpp"
 #include "constant_integral.hip.hpp"
 #include "Sequence.hip.hpp"
 #include "Array.hip.hpp"
 #include "functional.hip.hpp"
 #include "functional2.hip.hpp"
 
-#if DEVICE_BACKEND_HIP
+#if USE_AMD_INLINE_ASM
 #include "amd_inline_asm.hip.hpp"
 #endif
diff --git a/src/include/config.h.in b/src/include/config.h.in
index ce3232489d..c97d71def2 100644
--- a/src/include/config.h.in
+++ b/src/include/config.h.in
@@ -5,11 +5,56 @@
 #if DEVICE_BACKEND_HIP
 #include "hip/hip_runtime.h"
 #include "hip/hip_fp16.h"
+#define USE_AMD_INLINE_ASM 1
+
+// For some reason, the HIP compiler needs these definitions to generate optimal load and
+// store instructions
+typedef float float2_t __attribute__((ext_vector_type(2)));
+typedef float float4_t __attribute__((ext_vector_type(4)));
 #elif DEVICE_BACKEND_CUDA
 #include "cuda_runtime.h"
 #include "cuda_fp16.h"
 #include "nvToolsExt.h"
 #include "helper_cuda.h"
+#define USE_AMD_INLINE_ASM 0
+
+// For some reason, CUDA needs these definitions; otherwise the
+// compiler won't generate optimal load and store instructions, and the
+// kernel would produce wrong results, indicating the compiler failed to
+// generate correct instructions
+using float2_t = float2;
+using float4_t = float4;
 #endif
 
 using index_t = uint32_t;
+
+__device__ void fused_multiply_accumulate(float& d, const float& s0, const float& s1)
+{
+    d += s0 * s1;
+}
+
+#if 0
+__device__ void fused_multiply_accumulate(half& d, const half& s0, const half& s1) { d += s0 * s1; }
+
+__device__ void fused_multiply_accumulate(half& d, const half2& s0, const half2& s1)
+{
+    d += s0.x * s1.x;
+    d += s0.y * s1.y;
+}
+
+__device__ void fused_multiply_accumulate(float& d, const half2& s0, const half2& s1)
+{
+    d += s0.x * s1.x + s0.y * s1.y;
+}
+
+__device__ void fused_multiply_accumulate(char& d, const char& s0, const char& s1) { d += s0 * s1; }
+
+// TODO: this interface is misleading; s0 and s1 are actually int8x4.
+// Need a better interface.
+__device__ void fused_multiply_accumulate(int32_t& d, const int32_t& s0, const int32_t& s1)
+{
+#if DEVICE_BACKEND_CUDA
+    d = __dp4a(s0, s1, d);
+#endif
+}
+#endif
diff --git a/src/include/data_type.hip.hpp b/src/include/vector_type.hip.hpp
similarity index 59%
rename from src/include/data_type.hip.hpp
rename to src/include/vector_type.hip.hpp
index b341609a39..d2d3db92ec 100644
--- a/src/include/data_type.hip.hpp
+++ b/src/include/vector_type.hip.hpp
@@ -23,17 +23,7 @@ struct vector_type
 template <>
 struct vector_type<float, 2>
 {
-#if DEVICE_BACKEND_HIP
-    // For some reason, HIP compiler need this definition to generate optimal load and store
-    // instruction
-    typedef float MemoryType __attribute__((ext_vector_type(2)));
-#elif DEVICE_BACKEND_CUDA
-    // For some reason, CUDA need this definition, otherwise
-    // compiler won't generate optimal load and store instruction, and
-    // kernel would produce wrong result, indicating the compiler fail to generate correct
-    // instruction,
-    using MemoryType = float2;
-#endif
+    using MemoryType = float2_t;
 
     union Data
     {
@@ -60,17 +50,7 @@ struct vector_type
 template <>
 struct vector_type<float, 4>
 {
-#if DEVICE_BACKEND_HIP
-    // For some reason, HIP compiler need this definition to generate optimal load and store
-    // instruction
-    typedef float MemoryType __attribute__((ext_vector_type(4)));
-#elif DEVICE_BACKEND_CUDA
-    // For some reason, CUDA need this definition, otherwise
-    // compiler won't generate optimal load and store instruction, and
-    // kernel would produce wrong result, indicating the compiler fail to generate correct
-    // instruction,
-    using MemoryType = float4;
-#endif
+    using MemoryType = float4_t;
 
     template <index_t I>
     __host__ __device__ static void SetScalar(MemoryType& v, float s, Number<I>)
@@ -204,48 +184,3 @@ struct vector_type
     using MemoryType = int64_t;
 };
 #endif
-
-__device__ void fused_multiply_accumulate(float& d, const float& s0, const float& s1)
-{
-    d += s0 * s1;
-}
-
-__device__ void fused_multiply_accumulate(float& d, const float2& s0, const float2& s1)
-{
-    d += s0.x * s1.x;
-    d += s0.y * s1.y;
-}
-
-__device__ void fused_multiply_accumulate(float& d, const float4& s0, const float4& s1)
-{
-    d += s0.x * s1.x;
-    d += s0.y * s1.y;
-    d += s0.z * s1.z;
-    d += s0.w * s1.w;
-}
-
-#if 0
-__device__ void fused_multiply_accumulate(half& d, const half& s0, const half& s1) { d += s0 * s1; }
-
-__device__ void fused_multiply_accumulate(half& d, const half2& s0, const half2& s1)
-{
-    d += s0.x * s1.x;
-    d += s0.y * s1.y;
-}
-
-__device__ void fused_multiply_accumulate(float& d, const half2& s0, const half2& s1)
-{
-    d += s0.x * s1.x + s0.y * s1.y;
-}
-
-__device__ void fused_multiply_accumulate(char& d, const char& s0, const char& s1) { d += s0 * s1; }
-
-// TODO:: this interface is misleading, s0, s1 are actually int8x4
-// need to make a better interface
-__device__ void fused_multiply_accumulate(int32_t& d, const int32_t& s0, const int32_t& s1)
-{
-#if DEVICE_BACKEND_CUDA
-    d = __dp4a(s0, s1, d);
-#endif
-}
-#endif
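
Reviewer note: below is a minimal sketch, not part of the patch, of how the definitions consolidated into config.h.in are expected to compose after this change. The kernel name and the assumption that the generated config header is reachable through common.hip.hpp are illustrative only.

// Sketch for review purposes -- not part of this patch.
// Assumes the configured header generated from config.h.in is pulled in by
// common.hip.hpp, so float4_t, index_t, and fused_multiply_accumulate are
// all visible here.
#include "common.hip.hpp"

__global__ void vector_load_fma_sketch(const float4_t* __restrict__ p_a,
                                       const float4_t* __restrict__ p_b,
                                       float* __restrict__ p_c)
{
    const index_t i = blockIdx.x * blockDim.x + threadIdx.x;

    // float4_t is a Clang ext_vector_type on the HIP backend and a plain
    // float4 on the CUDA backend; either way, each assignment below should
    // lower to a single 128-bit global load.
    const float4_t a = p_a[i];
    const float4_t b = p_b[i];

    // Only the scalar float overload of fused_multiply_accumulate remains
    // active after this patch (the float2/float4 overloads were dropped),
    // so the per-thread dot product is accumulated component-wise.
    float c = p_c[i];
    fused_multiply_accumulate(c, a.x, b.x);
    fused_multiply_accumulate(c, a.y, b.y);
    fused_multiply_accumulate(c, a.z, b.z);
    fused_multiply_accumulate(c, a.w, b.w);

    p_c[i] = c;
}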