mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-12 17:26:00 +00:00
make innner product compatiable on gfx900
This commit is contained in:
@@ -97,8 +97,7 @@ struct ThreadwiseGemmDlops_km0m1_kn0n1_m0m1n0n1
|
|||||||
CThreadDesc_TM0_TM1_TN0_TN1{}.CalculateOffset(
|
CThreadDesc_TM0_TM1_TN0_TN1{}.CalculateOffset(
|
||||||
c_origin_idx + make_multi_index(tm0, tm1, tn0, tn1));
|
c_origin_idx + make_multi_index(tm0, tm1, tn0, tn1));
|
||||||
|
|
||||||
amd_inner_product_dlop<FloatA, FloatB, FloatC>(
|
inner_product<FloatA, FloatB, FloatC>(a_buf[Number<a_offset>{}],
|
||||||
a_buf[Number<a_offset>{}],
|
|
||||||
b_buf[Number<b_offset>{}],
|
b_buf[Number<b_offset>{}],
|
||||||
c_buf(Number<c_offset>{}));
|
c_buf(Number<c_offset>{}));
|
||||||
});
|
});
|
||||||
@@ -214,7 +213,7 @@ struct ThreadwiseContractionDlops_A_TK0_TM0_TM1_TK1_B_TK0_TN0_TN1_TK1_C_TM0_TM1_
|
|||||||
CThreadDesc_TM0_TM1_TN0_TN1{}.CalculateOffset(
|
CThreadDesc_TM0_TM1_TN0_TN1{}.CalculateOffset(
|
||||||
c_origin_idx + make_multi_index(tm0, tm1, tn0, tn1));
|
c_origin_idx + make_multi_index(tm0, tm1, tn0, tn1));
|
||||||
|
|
||||||
amd_inner_product_dlop<a_vector_t, b_vector_t, FloatC>(
|
inner_product<a_vector_t, b_vector_t, FloatC>(
|
||||||
a_vec.template AsType<a_vector_t>()[I0],
|
a_vec.template AsType<a_vector_t>()[I0],
|
||||||
b_vec.template AsType<b_vector_t>()[I0],
|
b_vec.template AsType<b_vector_t>()[I0],
|
||||||
c_buf(Number<c_offset>{}));
|
c_buf(Number<c_offset>{}));
|
||||||
|
|||||||
@@ -1,188 +0,0 @@
|
|||||||
#ifndef CK_AMD_DLOP_HPP
|
|
||||||
#define CK_AMD_DLOP_HPP
|
|
||||||
|
|
||||||
#include "data_type.hpp"
|
|
||||||
|
|
||||||
namespace ck {
|
|
||||||
|
|
||||||
template <typename TA, typename TB, typename TC>
|
|
||||||
__device__ void amd_inner_product_dlop(const TA& a, const TB& b, TC& c);
|
|
||||||
|
|
||||||
template <>
|
|
||||||
__device__ void
|
|
||||||
amd_inner_product_dlop<float, float, float>(const float& a, const float& b, float& c)
|
|
||||||
{
|
|
||||||
#if CK_USE_AMD_DLOP_INLINE_ASM
|
|
||||||
asm volatile("\n \
|
|
||||||
v_fmac_f32 %0, %1, %2 \n \
|
|
||||||
"
|
|
||||||
: "=v"(c)
|
|
||||||
: "v"(a), "v"(b), "0"(c));
|
|
||||||
#else
|
|
||||||
c += a * b;
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
template <>
|
|
||||||
__device__ void
|
|
||||||
amd_inner_product_dlop<float2_t, float2_t, float>(const float2_t& a, const float2_t& b, float& c)
|
|
||||||
{
|
|
||||||
constexpr auto I0 = Number<0>{};
|
|
||||||
constexpr auto I1 = Number<1>{};
|
|
||||||
|
|
||||||
amd_inner_product_dlop(vector_type<float, 2>{a}.AsType<float>()[I0],
|
|
||||||
vector_type<float, 2>{b}.AsType<float>()[I0],
|
|
||||||
c);
|
|
||||||
|
|
||||||
amd_inner_product_dlop(vector_type<float, 2>{a}.AsType<float>()[I1],
|
|
||||||
vector_type<float, 2>{b}.AsType<float>()[I1],
|
|
||||||
c);
|
|
||||||
}
|
|
||||||
|
|
||||||
template <>
|
|
||||||
__device__ void
|
|
||||||
amd_inner_product_dlop<float4_t, float4_t, float>(const float4_t& a, const float4_t& b, float& c)
|
|
||||||
{
|
|
||||||
constexpr auto I0 = Number<0>{};
|
|
||||||
constexpr auto I1 = Number<1>{};
|
|
||||||
constexpr auto I2 = Number<2>{};
|
|
||||||
constexpr auto I3 = Number<3>{};
|
|
||||||
|
|
||||||
amd_inner_product_dlop(vector_type<float, 4>{a}.AsType<float>()[I0],
|
|
||||||
vector_type<float, 4>{b}.AsType<float>()[I0],
|
|
||||||
c);
|
|
||||||
|
|
||||||
amd_inner_product_dlop(vector_type<float, 4>{a}.AsType<float>()[I1],
|
|
||||||
vector_type<float, 4>{b}.AsType<float>()[I1],
|
|
||||||
c);
|
|
||||||
|
|
||||||
amd_inner_product_dlop(vector_type<float, 4>{a}.AsType<float>()[I2],
|
|
||||||
vector_type<float, 4>{b}.AsType<float>()[I2],
|
|
||||||
c);
|
|
||||||
|
|
||||||
amd_inner_product_dlop(vector_type<float, 4>{a}.AsType<float>()[I3],
|
|
||||||
vector_type<float, 4>{b}.AsType<float>()[I3],
|
|
||||||
c);
|
|
||||||
}
|
|
||||||
|
|
||||||
#if CK_USE_AMD_DLOP
|
|
||||||
template <>
|
|
||||||
__device__ void
|
|
||||||
amd_inner_product_dlop<half2_t, half2_t, float>(const half2_t& a, const half2_t& b, float& c)
|
|
||||||
{
|
|
||||||
#if CK_USE_AMD_DLOP_INLINE_ASM
|
|
||||||
asm volatile("\n \
|
|
||||||
v_dot2_f32_f16 %0, %1, %2, %0\n \
|
|
||||||
"
|
|
||||||
: "=v"(c)
|
|
||||||
: "v"(a), "v"(b), "0"(c));
|
|
||||||
#else
|
|
||||||
c = __builtin_amdgcn_sdot2(a, b, c, false);
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
template <>
|
|
||||||
__device__ void
|
|
||||||
amd_inner_product_dlop<half4_t, half4_t, float>(const half4_t& a, const half4_t& b, float& c)
|
|
||||||
{
|
|
||||||
constexpr auto I0 = Number<0>{};
|
|
||||||
constexpr auto I1 = Number<1>{};
|
|
||||||
|
|
||||||
amd_inner_product_dlop(vector_type<half_t, 4>{a}.AsType<half2_t>()[I0],
|
|
||||||
vector_type<half_t, 4>{b}.AsType<half2_t>()[I0],
|
|
||||||
c);
|
|
||||||
|
|
||||||
amd_inner_product_dlop(vector_type<half_t, 4>{a}.AsType<half2_t>()[I1],
|
|
||||||
vector_type<half_t, 4>{b}.AsType<half2_t>()[I1],
|
|
||||||
c);
|
|
||||||
}
|
|
||||||
|
|
||||||
template <>
|
|
||||||
__device__ void
|
|
||||||
amd_inner_product_dlop<half8_t, half8_t, float>(const half8_t& a, const half8_t& b, float& c)
|
|
||||||
{
|
|
||||||
constexpr auto I0 = Number<0>{};
|
|
||||||
constexpr auto I1 = Number<1>{};
|
|
||||||
constexpr auto I2 = Number<2>{};
|
|
||||||
constexpr auto I3 = Number<3>{};
|
|
||||||
|
|
||||||
amd_inner_product_dlop(vector_type<half_t, 8>{a}.AsType<half2_t>()[I0],
|
|
||||||
vector_type<half_t, 8>{b}.AsType<half2_t>()[I0],
|
|
||||||
c);
|
|
||||||
|
|
||||||
amd_inner_product_dlop(vector_type<half_t, 8>{a}.AsType<half2_t>()[I1],
|
|
||||||
vector_type<half_t, 8>{b}.AsType<half2_t>()[I1],
|
|
||||||
c);
|
|
||||||
|
|
||||||
amd_inner_product_dlop(vector_type<half_t, 8>{a}.AsType<half2_t>()[I2],
|
|
||||||
vector_type<half_t, 8>{b}.AsType<half2_t>()[I2],
|
|
||||||
c);
|
|
||||||
|
|
||||||
amd_inner_product_dlop(vector_type<half_t, 8>{a}.AsType<half2_t>()[I3],
|
|
||||||
vector_type<half_t, 8>{b}.AsType<half2_t>()[I3],
|
|
||||||
c);
|
|
||||||
}
|
|
||||||
|
|
||||||
template <>
|
|
||||||
__device__ void amd_inner_product_dlop<int8x4_t, int8x4_t, int32_t>(const int8x4_t& a,
|
|
||||||
const int8x4_t& b,
|
|
||||||
int32_t& c)
|
|
||||||
{
|
|
||||||
#if CK_USE_AMD_DLOP_INLINE_ASM
|
|
||||||
asm volatile("\n \
|
|
||||||
v_dot4_i32_i8 %0, %1, %2, %0\n \
|
|
||||||
"
|
|
||||||
: "=v"(c)
|
|
||||||
: "v"(as_type<int32_t>(a)), "v"(as_type<int32_t>(b)), "0"(c));
|
|
||||||
#else
|
|
||||||
c = __builtin_amdgcn_sdot4(as_type<int32_t>(a), as_type<int32_t>(b), c, false);
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
template <>
|
|
||||||
__device__ void amd_inner_product_dlop<int8x8_t, int8x8_t, int32_t>(const int8x8_t& a,
|
|
||||||
const int8x8_t& b,
|
|
||||||
int32_t& c)
|
|
||||||
{
|
|
||||||
constexpr auto I0 = Number<0>{};
|
|
||||||
constexpr auto I1 = Number<1>{};
|
|
||||||
|
|
||||||
amd_inner_product_dlop(vector_type<int8_t, 8>{a}.AsType<int8x4_t>()[I0],
|
|
||||||
vector_type<int8_t, 8>{b}.AsType<int8x4_t>()[I0],
|
|
||||||
c);
|
|
||||||
|
|
||||||
amd_inner_product_dlop(vector_type<int8_t, 8>{a}.AsType<int8x4_t>()[I1],
|
|
||||||
vector_type<int8_t, 8>{b}.AsType<int8x4_t>()[I1],
|
|
||||||
c);
|
|
||||||
}
|
|
||||||
|
|
||||||
template <>
|
|
||||||
__device__ void amd_inner_product_dlop<int8x16_t, int8x16_t, int32_t>(const int8x16_t& a,
|
|
||||||
const int8x16_t& b,
|
|
||||||
int32_t& c)
|
|
||||||
{
|
|
||||||
constexpr auto I0 = Number<0>{};
|
|
||||||
constexpr auto I1 = Number<1>{};
|
|
||||||
constexpr auto I2 = Number<2>{};
|
|
||||||
constexpr auto I3 = Number<3>{};
|
|
||||||
|
|
||||||
amd_inner_product_dlop(vector_type<int8_t, 16>{a}.AsType<int8x4_t>()[I0],
|
|
||||||
vector_type<int8_t, 16>{b}.AsType<int8x4_t>()[I0],
|
|
||||||
c);
|
|
||||||
|
|
||||||
amd_inner_product_dlop(vector_type<int8_t, 16>{a}.AsType<int8x4_t>()[I1],
|
|
||||||
vector_type<int8_t, 16>{b}.AsType<int8x4_t>()[I1],
|
|
||||||
c);
|
|
||||||
|
|
||||||
amd_inner_product_dlop(vector_type<int8_t, 16>{a}.AsType<int8x4_t>()[I2],
|
|
||||||
vector_type<int8_t, 16>{b}.AsType<int8x4_t>()[I2],
|
|
||||||
c);
|
|
||||||
|
|
||||||
amd_inner_product_dlop(vector_type<int8_t, 16>{a}.AsType<int8x4_t>()[I3],
|
|
||||||
vector_type<int8_t, 16>{b}.AsType<int8x4_t>()[I3],
|
|
||||||
c);
|
|
||||||
}
|
|
||||||
#endif // CK_USE_AMD_DLOP
|
|
||||||
|
|
||||||
} // namespace ck
|
|
||||||
#endif
|
|
||||||
@@ -4,6 +4,8 @@
|
|||||||
#include "data_type.hpp"
|
#include "data_type.hpp"
|
||||||
#include "c_style_pointer_cast.hpp"
|
#include "c_style_pointer_cast.hpp"
|
||||||
|
|
||||||
|
// TODO: deprecate all amd_assembly_outer_product_xxx
|
||||||
|
|
||||||
namespace ck {
|
namespace ck {
|
||||||
|
|
||||||
// c0 += inner_product(a, b0)
|
// c0 += inner_product(a, b0)
|
||||||
|
|||||||
@@ -31,15 +31,13 @@
|
|||||||
#include "static_buffer.hpp"
|
#include "static_buffer.hpp"
|
||||||
#include "dynamic_buffer.hpp"
|
#include "dynamic_buffer.hpp"
|
||||||
|
|
||||||
|
#include "inner_product.hpp"
|
||||||
|
|
||||||
// TODO: remove this
|
// TODO: remove this
|
||||||
#if CK_USE_AMD_INLINE_ASM
|
#if CK_USE_AMD_INLINE_ASM
|
||||||
#include "amd_inline_asm.hpp"
|
#include "amd_inline_asm.hpp"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if CK_USE_AMD_DLOP
|
|
||||||
#include "amd_dlop.hpp"
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#if CK_USE_AMD_XDLOPS
|
#if CK_USE_AMD_XDLOPS
|
||||||
#include "amd_xdlops.hpp"
|
#include "amd_xdlops.hpp"
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@@ -14,12 +14,7 @@
|
|||||||
// should enable one and only one GPU target
|
// should enable one and only one GPU target
|
||||||
#if !(defined(CK_AMD_GPU_GFX803) || defined(CK_AMD_GPU_GFX900) || defined(CK_AMD_GPU_GFX906) || \
|
#if !(defined(CK_AMD_GPU_GFX803) || defined(CK_AMD_GPU_GFX900) || defined(CK_AMD_GPU_GFX906) || \
|
||||||
defined(CK_AMD_GPU_GFX908) || defined(CK_AMD_GPU_GFX90A) || defined(CK_AMD_GPU_GFX1030))
|
defined(CK_AMD_GPU_GFX908) || defined(CK_AMD_GPU_GFX90A) || defined(CK_AMD_GPU_GFX1030))
|
||||||
#error Need to define a single GPU target
|
#error Need to define (only) one GPU target
|
||||||
#endif
|
|
||||||
|
|
||||||
// HIP version
|
|
||||||
#ifndef CK_HIP_VERSION_FLAT
|
|
||||||
#define CK_HIP_VERSION_FLAT 0
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// launch bounds
|
// launch bounds
|
||||||
@@ -38,6 +33,16 @@
|
|||||||
#define CK_BUFFER_RESOURCE_3RD_DWORD 0x31014000
|
#define CK_BUFFER_RESOURCE_3RD_DWORD 0x31014000
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
// FMA instruction
|
||||||
|
#if defined(CK_AMD_GPU_GFX803) || defined(CK_AMD_GPU_GFX900)
|
||||||
|
#define CK_USE_AMD_V_MAC_F32
|
||||||
|
#elif defined(CK_AMD_GPU_GFX906) || defined(CK_AMD_GPU_GFX908) || defined(CK_AMD_GPU_GFX90a) || \
|
||||||
|
defined(CK_AMD_GPU_GFX1030)
|
||||||
|
#define CK_USE_AMD_V_FMAC_F32
|
||||||
|
#define CK_USE_AMD_V_DOT2_F32_F16
|
||||||
|
#define CK_USE_AMD_V_DOT4_I32_I8
|
||||||
|
#endif
|
||||||
|
|
||||||
// multi index
|
// multi index
|
||||||
#define CK_USE_DYNAMICALLY_INDEXED_MULTI_INDEX 0
|
#define CK_USE_DYNAMICALLY_INDEXED_MULTI_INDEX 0
|
||||||
|
|
||||||
@@ -46,13 +51,9 @@
|
|||||||
#define CK_USE_AMD_INLINE_ASM 1
|
#define CK_USE_AMD_INLINE_ASM 1
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// AMD DLOPS
|
// AMD inner product (DLOP)
|
||||||
#ifndef CK_USE_AMD_DLOP
|
#ifndef CK_USE_AMD_INNER_PRODUCT_INLINE_ASM
|
||||||
#define CK_USE_AMD_DLOP 1
|
#define CK_USE_AMD_INNER_PRODUCT_INLINE_ASM 1
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifndef CK_USE_AMD_DLOP_INLINE_ASM
|
|
||||||
#define CK_USE_AMD_DLOP_INLINE_ASM 1
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// AMD buffer addressing
|
// AMD buffer addressing
|
||||||
|
|||||||
207
composable_kernel/include/utility/inner_product.hpp
Normal file
207
composable_kernel/include/utility/inner_product.hpp
Normal file
@@ -0,0 +1,207 @@
|
|||||||
|
#ifndef CK_INNER_PRODUCT_HPP
|
||||||
|
#define CK_INNER_PRODUCT_HPP
|
||||||
|
|
||||||
|
#include "data_type.hpp"
|
||||||
|
|
||||||
|
namespace ck {
|
||||||
|
|
||||||
|
template <typename TA, typename TB, typename TC>
|
||||||
|
__device__ void inner_product(const TA& a, const TB& b, TC& c);
|
||||||
|
|
||||||
|
template <>
|
||||||
|
__device__ void inner_product<float, float, float>(const float& a, const float& b, float& c)
|
||||||
|
{
|
||||||
|
#if CK_USE_AMD_INNER_PRODUCT_INLINE_ASM && defined(CK_USE_AMD_V_MAC_F32)
|
||||||
|
asm volatile("\n \
|
||||||
|
v_mac_f32 %0, %1, %2 \n \
|
||||||
|
"
|
||||||
|
: "=v"(c)
|
||||||
|
: "v"(a), "v"(b), "0"(c));
|
||||||
|
#elif CK_USE_AMD_INNER_PRODUCT_INLINE_ASM && defined(CK_USE_AMD_V_FMAC_F32)
|
||||||
|
asm volatile("\n \
|
||||||
|
v_fmac_f32 %0, %1, %2 \n \
|
||||||
|
"
|
||||||
|
: "=v"(c)
|
||||||
|
: "v"(a), "v"(b), "0"(c));
|
||||||
|
#else
|
||||||
|
c += a * b;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
template <>
|
||||||
|
__device__ void
|
||||||
|
inner_product<float2_t, float2_t, float>(const float2_t& a, const float2_t& b, float& c)
|
||||||
|
{
|
||||||
|
constexpr auto I0 = Number<0>{};
|
||||||
|
constexpr auto I1 = Number<1>{};
|
||||||
|
|
||||||
|
inner_product(vector_type<float, 2>{a}.AsType<float>()[I0],
|
||||||
|
vector_type<float, 2>{b}.AsType<float>()[I0],
|
||||||
|
c);
|
||||||
|
|
||||||
|
inner_product(vector_type<float, 2>{a}.AsType<float>()[I1],
|
||||||
|
vector_type<float, 2>{b}.AsType<float>()[I1],
|
||||||
|
c);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <>
|
||||||
|
__device__ void
|
||||||
|
inner_product<float4_t, float4_t, float>(const float4_t& a, const float4_t& b, float& c)
|
||||||
|
{
|
||||||
|
constexpr auto I0 = Number<0>{};
|
||||||
|
constexpr auto I1 = Number<1>{};
|
||||||
|
constexpr auto I2 = Number<2>{};
|
||||||
|
constexpr auto I3 = Number<3>{};
|
||||||
|
|
||||||
|
inner_product(vector_type<float, 4>{a}.AsType<float>()[I0],
|
||||||
|
vector_type<float, 4>{b}.AsType<float>()[I0],
|
||||||
|
c);
|
||||||
|
|
||||||
|
inner_product(vector_type<float, 4>{a}.AsType<float>()[I1],
|
||||||
|
vector_type<float, 4>{b}.AsType<float>()[I1],
|
||||||
|
c);
|
||||||
|
|
||||||
|
inner_product(vector_type<float, 4>{a}.AsType<float>()[I2],
|
||||||
|
vector_type<float, 4>{b}.AsType<float>()[I2],
|
||||||
|
c);
|
||||||
|
|
||||||
|
inner_product(vector_type<float, 4>{a}.AsType<float>()[I3],
|
||||||
|
vector_type<float, 4>{b}.AsType<float>()[I3],
|
||||||
|
c);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <>
|
||||||
|
__device__ void inner_product<half2_t, half2_t, float>(const half2_t& a, const half2_t& b, float& c)
|
||||||
|
{
|
||||||
|
#if defined(CK_USE_AMD_V_DOT2_F32_F16)
|
||||||
|
#if CK_USE_AMD_INNER_PRODUCT_INLINE_ASM
|
||||||
|
asm volatile("\n \
|
||||||
|
v_dot2_f32_f16 %0, %1, %2, %0\n \
|
||||||
|
"
|
||||||
|
: "=v"(c)
|
||||||
|
: "v"(a), "v"(b), "0"(c));
|
||||||
|
#else
|
||||||
|
c = __builtin_amdgcn_sdot2(a, b, c, false);
|
||||||
|
#endif
|
||||||
|
#else
|
||||||
|
const auto convert = type_convert<int32_t>{};
|
||||||
|
|
||||||
|
const vector_type<half_t, 2> a_vector{a};
|
||||||
|
const vector_type<half_t, 2> b_vector{b};
|
||||||
|
|
||||||
|
static_for<0, 2, 1>{}([&](auto i) {
|
||||||
|
c += convert(a_vector.AsType<half_t>()[i]) * convert(b_vector.AsType<half_t>()[i]);
|
||||||
|
});
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
template <>
|
||||||
|
__device__ void inner_product<half4_t, half4_t, float>(const half4_t& a, const half4_t& b, float& c)
|
||||||
|
{
|
||||||
|
constexpr auto I0 = Number<0>{};
|
||||||
|
constexpr auto I1 = Number<1>{};
|
||||||
|
|
||||||
|
inner_product(vector_type<half_t, 4>{a}.AsType<half2_t>()[I0],
|
||||||
|
vector_type<half_t, 4>{b}.AsType<half2_t>()[I0],
|
||||||
|
c);
|
||||||
|
|
||||||
|
inner_product(vector_type<half_t, 4>{a}.AsType<half2_t>()[I1],
|
||||||
|
vector_type<half_t, 4>{b}.AsType<half2_t>()[I1],
|
||||||
|
c);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <>
|
||||||
|
__device__ void inner_product<half8_t, half8_t, float>(const half8_t& a, const half8_t& b, float& c)
|
||||||
|
{
|
||||||
|
constexpr auto I0 = Number<0>{};
|
||||||
|
constexpr auto I1 = Number<1>{};
|
||||||
|
constexpr auto I2 = Number<2>{};
|
||||||
|
constexpr auto I3 = Number<3>{};
|
||||||
|
|
||||||
|
inner_product(vector_type<half_t, 8>{a}.AsType<half2_t>()[I0],
|
||||||
|
vector_type<half_t, 8>{b}.AsType<half2_t>()[I0],
|
||||||
|
c);
|
||||||
|
|
||||||
|
inner_product(vector_type<half_t, 8>{a}.AsType<half2_t>()[I1],
|
||||||
|
vector_type<half_t, 8>{b}.AsType<half2_t>()[I1],
|
||||||
|
c);
|
||||||
|
|
||||||
|
inner_product(vector_type<half_t, 8>{a}.AsType<half2_t>()[I2],
|
||||||
|
vector_type<half_t, 8>{b}.AsType<half2_t>()[I2],
|
||||||
|
c);
|
||||||
|
|
||||||
|
inner_product(vector_type<half_t, 8>{a}.AsType<half2_t>()[I3],
|
||||||
|
vector_type<half_t, 8>{b}.AsType<half2_t>()[I3],
|
||||||
|
c);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <>
|
||||||
|
__device__ void
|
||||||
|
inner_product<int8x4_t, int8x4_t, int32_t>(const int8x4_t& a, const int8x4_t& b, int32_t& c)
|
||||||
|
{
|
||||||
|
#if defined(CK_USE_DOT4_I32_I8)
|
||||||
|
#if CK_USE_AMD_INNER_PRODUCT_INLINE_ASM
|
||||||
|
asm volatile("\n \
|
||||||
|
v_dot4_i32_i8 %0, %1, %2, %0\n \
|
||||||
|
"
|
||||||
|
: "=v"(c)
|
||||||
|
: "v"(as_type<int32_t>(a)), "v"(as_type<int32_t>(b)), "0"(c));
|
||||||
|
#else
|
||||||
|
c = __builtin_amdgcn_sdot4(as_type<int32_t>(a), as_type<int32_t>(b), c, false);
|
||||||
|
#endif
|
||||||
|
#else
|
||||||
|
const auto convert = type_convert<int32_t>{};
|
||||||
|
|
||||||
|
const vector_type<int8_t, 4> a_vector{a};
|
||||||
|
const vector_type<int8_t, 4> b_vector{b};
|
||||||
|
|
||||||
|
static_for<0, 4, 1>{}([&](auto i) {
|
||||||
|
c += convert(a_vector.AsType<int8_t>()[i]) * convert(b_vector.AsType<int8_t>()[i]);
|
||||||
|
});
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
template <>
|
||||||
|
__device__ void
|
||||||
|
inner_product<int8x8_t, int8x8_t, int32_t>(const int8x8_t& a, const int8x8_t& b, int32_t& c)
|
||||||
|
{
|
||||||
|
constexpr auto I0 = Number<0>{};
|
||||||
|
constexpr auto I1 = Number<1>{};
|
||||||
|
|
||||||
|
inner_product(vector_type<int8_t, 8>{a}.AsType<int8x4_t>()[I0],
|
||||||
|
vector_type<int8_t, 8>{b}.AsType<int8x4_t>()[I0],
|
||||||
|
c);
|
||||||
|
|
||||||
|
inner_product(vector_type<int8_t, 8>{a}.AsType<int8x4_t>()[I1],
|
||||||
|
vector_type<int8_t, 8>{b}.AsType<int8x4_t>()[I1],
|
||||||
|
c);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <>
|
||||||
|
__device__ void
|
||||||
|
inner_product<int8x16_t, int8x16_t, int32_t>(const int8x16_t& a, const int8x16_t& b, int32_t& c)
|
||||||
|
{
|
||||||
|
constexpr auto I0 = Number<0>{};
|
||||||
|
constexpr auto I1 = Number<1>{};
|
||||||
|
constexpr auto I2 = Number<2>{};
|
||||||
|
constexpr auto I3 = Number<3>{};
|
||||||
|
|
||||||
|
inner_product(vector_type<int8_t, 16>{a}.AsType<int8x4_t>()[I0],
|
||||||
|
vector_type<int8_t, 16>{b}.AsType<int8x4_t>()[I0],
|
||||||
|
c);
|
||||||
|
|
||||||
|
inner_product(vector_type<int8_t, 16>{a}.AsType<int8x4_t>()[I1],
|
||||||
|
vector_type<int8_t, 16>{b}.AsType<int8x4_t>()[I1],
|
||||||
|
c);
|
||||||
|
|
||||||
|
inner_product(vector_type<int8_t, 16>{a}.AsType<int8x4_t>()[I2],
|
||||||
|
vector_type<int8_t, 16>{b}.AsType<int8x4_t>()[I2],
|
||||||
|
c);
|
||||||
|
|
||||||
|
inner_product(vector_type<int8_t, 16>{a}.AsType<int8x4_t>()[I3],
|
||||||
|
vector_type<int8_t, 16>{b}.AsType<int8x4_t>()[I3],
|
||||||
|
c);
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace ck
|
||||||
|
#endif
|
||||||
@@ -48,7 +48,7 @@ void device_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw(
|
|||||||
const auto wei_desc_k_c_y_x = make_naive_tensor_descriptor_packed(wei_k_c_y_x_lengths);
|
const auto wei_desc_k_c_y_x = make_naive_tensor_descriptor_packed(wei_k_c_y_x_lengths);
|
||||||
const auto out_desc_n_k_ho_wo = make_naive_tensor_descriptor_packed(out_n_k_ho_wo_lengths);
|
const auto out_desc_n_k_ho_wo = make_naive_tensor_descriptor_packed(out_n_k_ho_wo_lengths);
|
||||||
|
|
||||||
#if 0
|
#if 1
|
||||||
// [8, 1, 128, 1] * [8, 4, 32, 1] = [1, 128, 4, 32] for fp32
|
// [8, 1, 128, 1] * [8, 4, 32, 1] = [1, 128, 4, 32] for fp32
|
||||||
// cdata = 64, BlockSize = 256
|
// cdata = 64, BlockSize = 256
|
||||||
constexpr index_t BlockSize = 256;
|
constexpr index_t BlockSize = 256;
|
||||||
|
|||||||
@@ -20,9 +20,9 @@
|
|||||||
#include "device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp"
|
#include "device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp"
|
||||||
|
|
||||||
#define USE_MODE 1
|
#define USE_MODE 1
|
||||||
#define USE_CONV_FWD_V4R4_NCHW 0
|
#define USE_CONV_FWD_V4R4_NCHW 1
|
||||||
#define USE_CONV_FWD_V4R4R2_NHWC 1
|
#define USE_CONV_FWD_V4R4R2_NHWC 0
|
||||||
#define USE_CONV_FWD_V6R1_NCHW 1
|
#define USE_CONV_FWD_V6R1_NCHW 0
|
||||||
#define USE_CONV_FWD_V5R1_NCHW 0
|
#define USE_CONV_FWD_V5R1_NCHW 0
|
||||||
#define USE_CONV_FWD_V4R4R2_XDL_NCHW 0
|
#define USE_CONV_FWD_V4R4R2_XDL_NCHW 0
|
||||||
#define USE_CONV_FWD_V4R4R4_XDL_NHWC 0
|
#define USE_CONV_FWD_V4R4R4_XDL_NHWC 0
|
||||||
@@ -126,7 +126,7 @@ int main(int argc, char* argv[])
|
|||||||
const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
|
const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if 0
|
#if 1
|
||||||
using in_data_t = float;
|
using in_data_t = float;
|
||||||
using acc_data_t = float;
|
using acc_data_t = float;
|
||||||
using out_data_t = float;
|
using out_data_t = float;
|
||||||
|
|||||||
Reference in New Issue
Block a user