mirror of https://github.com/ROCm/composable_kernel.git (synced 2026-05-11 08:50:17 +00:00)

Commit message: refactored

@@ -23,11 +23,7 @@ struct ConstantMatrixDescriptor
     __host__ __device__ index_t Get1dIndex(index_t irow, index_t icol) const
     {
-#if DEVICE_BACKEND_HIP
-        return __mul24(irow, RowStride_) + icol;
-#else
         return irow * RowStride_ + icol;
-#endif
     }
 
     template <index_t SubNRow, index_t SubNCol>
 
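For context, the method above linearizes a (row, col) pair into a row-major offset; the refactor drops the `__mul24` fast path in favor of the plain multiply. A minimal host-side sketch of that mapping (the free function and the test values are illustrative, not part of the commit):

    #include <cassert>
    #include <cstdint>

    using index_t = uint32_t; // same alias config.hip.hpp introduces

    // Row-major offset: element (irow, icol) of a matrix with the given
    // row stride lives at irow * row_stride + icol.
    index_t get_1d_index(index_t irow, index_t icol, index_t row_stride)
    {
        return irow * row_stride + icol;
    }

    int main()
    {
        assert(get_1d_index(2, 3, 4) == 11); // row 2, col 3, row stride 4
        return 0;
    }
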
@@ -668,7 +668,7 @@ struct Blockwise2dTensorCopy3
         }
     }
 
-#if DEVICE_BACKEND_HIP
+#if USE_AMD_INLINE_ASM
     __device__ void RunLoadRegisterClipboard_asm(const Float* __restrict__ p_src,
                                                  Float* p_clipboard) const
     {

@@ -283,7 +283,7 @@ struct BlockwiseBatchGemmBlockABlockBThreadCTransANormalBNormalC_V2
         }
     }
 
-#if DEVICE_BACKEND_HIP
+#if USE_AMD_INLINE_ASM
     template <class FloatA, class FloatB, class FloatC>
     __device__ void Run_asm(const FloatA* __restrict__ p_a_block,
                             const FloatB* __restrict__ p_b_block,

@@ -126,7 +126,7 @@ struct BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2
                        n_repeat * NPerLevel1Cluster + n_in_sub_c};
     }
 
-#if DEVICE_BACKEND_HIP
+#if USE_AMD_INLINE_ASM
     // TODO: this is not working correctly
     template <class FloatA, class FloatB, class FloatC>
     __device__ void Run_asm(const FloatA* __restrict__ p_a_block,

@@ -1,12 +1,12 @@
 #pragma once
 #include "data_type.hip.hpp"
 #include "vector_type.hip.hpp"
 #include "constant_integral.hip.hpp"
 #include "Sequence.hip.hpp"
 #include "Array.hip.hpp"
 #include "functional.hip.hpp"
 #include "functional2.hip.hpp"
 
-#if DEVICE_BACKEND_HIP
+#if USE_AMD_INLINE_ASM
 #include "amd_inline_asm.hip.hpp"
 #endif

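The one-line guard swaps in the hunks above all follow from the same idea: config.hip.hpp now defines USE_AMD_INLINE_ASM for every backend (see the next hunk), so each call site tests a single feature macro instead of testing the backend directly. A hedged sketch of the pattern (the fast-path and fallback bodies here are illustrative placeholders):

    // config-style header: each backend decides whether inline asm is usable
    #if DEVICE_BACKEND_HIP
    #define USE_AMD_INLINE_ASM 1
    #elif DEVICE_BACKEND_CUDA
    #define USE_AMD_INLINE_ASM 0
    #endif

    // call site: one macro gates the asm fast path everywhere
    #if USE_AMD_INLINE_ASM
    // ... AMD inline-asm implementation ...
    #else
    // ... portable C++ implementation ...
    #endif
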
@@ -5,11 +5,56 @@
 #if DEVICE_BACKEND_HIP
 #include "hip/hip_runtime.h"
 #include "hip/hip_fp16.h"
+#define USE_AMD_INLINE_ASM 1
+
+// For some reason, the HIP compiler needs this definition to generate optimal load and store
+// instructions
+typedef float float2_t __attribute__((ext_vector_type(2)));
+typedef float float4_t __attribute__((ext_vector_type(4)));
 #elif DEVICE_BACKEND_CUDA
 #include "cuda_runtime.h"
 #include "cuda_fp16.h"
 #include "nvToolsExt.h"
 #include "helper_cuda.h"
+#define USE_AMD_INLINE_ASM 0
+
+// For some reason, CUDA needs this definition; otherwise the
+// compiler won't generate optimal load and store instructions, and the
+// kernel would produce wrong results, indicating the compiler fails to generate correct
+// instructions
+using float2_t = float2;
+using float4_t = float4;
 #endif
 
 using index_t = uint32_t;
+
+__device__ void fused_multiply_accumulate(float& d, const float& s0, const float& s1)
+{
+    d += s0 * s1;
+}
+
+#if 0
+__device__ void fused_multiply_accumulate(half& d, const half& s0, const half& s1) { d += s0 * s1; }
+
+__device__ void fused_multiply_accumulate(half& d, const half2& s0, const half2& s1)
+{
+    d += s0.x * s1.x;
+    d += s0.y * s1.y;
+}
+
+__device__ void fused_multiply_accumulate(float& d, const half2& s0, const half2& s1)
+{
+    d += s0.x * s1.x + s0.y * s1.y;
+}
+
+__device__ void fused_multiply_accumulate(char& d, const char& s0, const char& s1) { d += s0 * s1; }
+
+// TODO: this interface is misleading; s0 and s1 are actually int8x4.
+// Need to make a better interface.
+__device__ void fused_multiply_accumulate(int32_t& d, const int32_t& s0, const int32_t& s1)
+{
+#if DEVICE_BACKEND_CUDA
+    d = __dp4a(s0, s1, d);
+#endif
+}
+#endif

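The point of the shared float2_t/float4_t aliases is that a single reinterpret-and-assign compiles to one wide memory transaction on either backend. A minimal device-side sketch, assuming config.hip.hpp is included and both pointers are 16-byte aligned (copy_float4 is an illustrative name, not part of the commit):

    // One 128-bit load and one 128-bit store instead of four scalar pairs,
    // provided src and dst are 16-byte aligned.
    __device__ void copy_float4(const float* __restrict__ src, float* __restrict__ dst)
    {
        *reinterpret_cast<float4_t*>(dst) = *reinterpret_cast<const float4_t*>(src);
    }
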
@@ -23,17 +23,7 @@ struct vector_type<float, 1>
 template <>
 struct vector_type<float, 2>
 {
-#if DEVICE_BACKEND_HIP
-    // For some reason, the HIP compiler needs this definition to generate optimal load and store
-    // instructions
-    typedef float MemoryType __attribute__((ext_vector_type(2)));
-#elif DEVICE_BACKEND_CUDA
-    // For some reason, CUDA needs this definition; otherwise the
-    // compiler won't generate optimal load and store instructions, and the
-    // kernel would produce wrong results, indicating the compiler fails to generate correct
-    // instructions
-    using MemoryType = float2;
-#endif
+    using MemoryType = float2_t;
 
     union Data
     {

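The union Data member that follows presumably overlays the vector MemoryType with its scalar lanes, so the same bytes can be addressed either way. A sketch of that shape, assuming a two-float layout (the member names are a guess, not the commit's code):

    union Data
    {
        float2_t vector;  // one 8-byte vector view
        float scalar[2];  // two scalar lanes over the same bytes
    };
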
@@ -60,17 +50,7 @@ struct vector_type<float, 2>
 template <>
 struct vector_type<float, 4>
 {
-#if DEVICE_BACKEND_HIP
-    // For some reason, the HIP compiler needs this definition to generate optimal load and store
-    // instructions
-    typedef float MemoryType __attribute__((ext_vector_type(4)));
-#elif DEVICE_BACKEND_CUDA
-    // For some reason, CUDA needs this definition; otherwise the
-    // compiler won't generate optimal load and store instructions, and the
-    // kernel would produce wrong results, indicating the compiler fails to generate correct
-    // instructions
-    using MemoryType = float4;
-#endif
+    using MemoryType = float4_t;
 
     template <index_t I>
     __host__ __device__ static void SetScalar(MemoryType& v, float s, Number<I>)

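SetScalar's body is cut off by the hunk; one plausible implementation, assuming it writes s into compile-time lane I of the vector (this body is a guess for illustration, not the commit's code):

    template <index_t I>
    __host__ __device__ static void SetScalar(MemoryType& v, float s, Number<I>)
    {
        static_assert(I < 4, "I must index one of the four lanes");
        reinterpret_cast<float*>(&v)[I] = s;
    }
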
@@ -204,48 +184,3 @@ struct vector_type<char4, 2>
     using MemoryType = int64_t;
 };
 #endif
-
-__device__ void fused_multiply_accumulate(float& d, const float& s0, const float& s1)
-{
-    d += s0 * s1;
-}
-
-__device__ void fused_multiply_accumulate(float& d, const float2& s0, const float2& s1)
-{
-    d += s0.x * s1.x;
-    d += s0.y * s1.y;
-}
-
-__device__ void fused_multiply_accumulate(float& d, const float4& s0, const float4& s1)
-{
-    d += s0.x * s1.x;
-    d += s0.y * s1.y;
-    d += s0.z * s1.z;
-    d += s0.w * s1.w;
-}
-
-#if 0
-__device__ void fused_multiply_accumulate(half& d, const half& s0, const half& s1) { d += s0 * s1; }
-
-__device__ void fused_multiply_accumulate(half& d, const half2& s0, const half2& s1)
-{
-    d += s0.x * s1.x;
-    d += s0.y * s1.y;
-}
-
-__device__ void fused_multiply_accumulate(float& d, const half2& s0, const half2& s1)
-{
-    d += s0.x * s1.x + s0.y * s1.y;
-}
-
-__device__ void fused_multiply_accumulate(char& d, const char& s0, const char& s1) { d += s0 * s1; }
-
-// TODO: this interface is misleading; s0 and s1 are actually int8x4.
-// Need to make a better interface.
-__device__ void fused_multiply_accumulate(int32_t& d, const int32_t& s0, const int32_t& s1)
-{
-#if DEVICE_BACKEND_CUDA
-    d = __dp4a(s0, s1, d);
-#endif
-}
-#endif

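After this commit, only the scalar float overload of fused_multiply_accumulate remains active in config.hip.hpp (the half, char, and int8x4 variants sit under #if 0, and the float2/float4 overloads removed here are not re-added). A toy usage sketch of the surviving overload; dot_product and its arguments are illustrative names:

    __device__ float dot_product(const float* a, const float* b, index_t n)
    {
        float acc = 0.0f;
        for(index_t i = 0; i < n; ++i)
            fused_multiply_accumulate(acc, a[i], b[i]); // acc += a[i] * b[i]
        return acc;
    }
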