mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-11 17:00:18 +00:00
make innner product compatiable on gfx900
This commit is contained in:
@@ -97,10 +97,9 @@ struct ThreadwiseGemmDlops_km0m1_kn0n1_m0m1n0n1
|
||||
CThreadDesc_TM0_TM1_TN0_TN1{}.CalculateOffset(
|
||||
c_origin_idx + make_multi_index(tm0, tm1, tn0, tn1));
|
||||
|
||||
amd_inner_product_dlop<FloatA, FloatB, FloatC>(
|
||||
a_buf[Number<a_offset>{}],
|
||||
b_buf[Number<b_offset>{}],
|
||||
c_buf(Number<c_offset>{}));
|
||||
inner_product<FloatA, FloatB, FloatC>(a_buf[Number<a_offset>{}],
|
||||
b_buf[Number<b_offset>{}],
|
||||
c_buf(Number<c_offset>{}));
|
||||
});
|
||||
});
|
||||
});
|
||||
@@ -214,7 +213,7 @@ struct ThreadwiseContractionDlops_A_TK0_TM0_TM1_TK1_B_TK0_TN0_TN1_TK1_C_TM0_TM1_
|
||||
CThreadDesc_TM0_TM1_TN0_TN1{}.CalculateOffset(
|
||||
c_origin_idx + make_multi_index(tm0, tm1, tn0, tn1));
|
||||
|
||||
amd_inner_product_dlop<a_vector_t, b_vector_t, FloatC>(
|
||||
inner_product<a_vector_t, b_vector_t, FloatC>(
|
||||
a_vec.template AsType<a_vector_t>()[I0],
|
||||
b_vec.template AsType<b_vector_t>()[I0],
|
||||
c_buf(Number<c_offset>{}));
|
||||
|
||||
@@ -1,188 +0,0 @@
|
||||
#ifndef CK_AMD_DLOP_HPP
|
||||
#define CK_AMD_DLOP_HPP
|
||||
|
||||
#include "data_type.hpp"
|
||||
|
||||
namespace ck {
|
||||
|
||||
template <typename TA, typename TB, typename TC>
|
||||
__device__ void amd_inner_product_dlop(const TA& a, const TB& b, TC& c);
|
||||
|
||||
template <>
|
||||
__device__ void
|
||||
amd_inner_product_dlop<float, float, float>(const float& a, const float& b, float& c)
|
||||
{
|
||||
#if CK_USE_AMD_DLOP_INLINE_ASM
|
||||
asm volatile("\n \
|
||||
v_fmac_f32 %0, %1, %2 \n \
|
||||
"
|
||||
: "=v"(c)
|
||||
: "v"(a), "v"(b), "0"(c));
|
||||
#else
|
||||
c += a * b;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <>
|
||||
__device__ void
|
||||
amd_inner_product_dlop<float2_t, float2_t, float>(const float2_t& a, const float2_t& b, float& c)
|
||||
{
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
|
||||
amd_inner_product_dlop(vector_type<float, 2>{a}.AsType<float>()[I0],
|
||||
vector_type<float, 2>{b}.AsType<float>()[I0],
|
||||
c);
|
||||
|
||||
amd_inner_product_dlop(vector_type<float, 2>{a}.AsType<float>()[I1],
|
||||
vector_type<float, 2>{b}.AsType<float>()[I1],
|
||||
c);
|
||||
}
|
||||
|
||||
template <>
|
||||
__device__ void
|
||||
amd_inner_product_dlop<float4_t, float4_t, float>(const float4_t& a, const float4_t& b, float& c)
|
||||
{
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
constexpr auto I3 = Number<3>{};
|
||||
|
||||
amd_inner_product_dlop(vector_type<float, 4>{a}.AsType<float>()[I0],
|
||||
vector_type<float, 4>{b}.AsType<float>()[I0],
|
||||
c);
|
||||
|
||||
amd_inner_product_dlop(vector_type<float, 4>{a}.AsType<float>()[I1],
|
||||
vector_type<float, 4>{b}.AsType<float>()[I1],
|
||||
c);
|
||||
|
||||
amd_inner_product_dlop(vector_type<float, 4>{a}.AsType<float>()[I2],
|
||||
vector_type<float, 4>{b}.AsType<float>()[I2],
|
||||
c);
|
||||
|
||||
amd_inner_product_dlop(vector_type<float, 4>{a}.AsType<float>()[I3],
|
||||
vector_type<float, 4>{b}.AsType<float>()[I3],
|
||||
c);
|
||||
}
|
||||
|
||||
#if CK_USE_AMD_DLOP
|
||||
template <>
|
||||
__device__ void
|
||||
amd_inner_product_dlop<half2_t, half2_t, float>(const half2_t& a, const half2_t& b, float& c)
|
||||
{
|
||||
#if CK_USE_AMD_DLOP_INLINE_ASM
|
||||
asm volatile("\n \
|
||||
v_dot2_f32_f16 %0, %1, %2, %0\n \
|
||||
"
|
||||
: "=v"(c)
|
||||
: "v"(a), "v"(b), "0"(c));
|
||||
#else
|
||||
c = __builtin_amdgcn_sdot2(a, b, c, false);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <>
|
||||
__device__ void
|
||||
amd_inner_product_dlop<half4_t, half4_t, float>(const half4_t& a, const half4_t& b, float& c)
|
||||
{
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
|
||||
amd_inner_product_dlop(vector_type<half_t, 4>{a}.AsType<half2_t>()[I0],
|
||||
vector_type<half_t, 4>{b}.AsType<half2_t>()[I0],
|
||||
c);
|
||||
|
||||
amd_inner_product_dlop(vector_type<half_t, 4>{a}.AsType<half2_t>()[I1],
|
||||
vector_type<half_t, 4>{b}.AsType<half2_t>()[I1],
|
||||
c);
|
||||
}
|
||||
|
||||
template <>
|
||||
__device__ void
|
||||
amd_inner_product_dlop<half8_t, half8_t, float>(const half8_t& a, const half8_t& b, float& c)
|
||||
{
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
constexpr auto I3 = Number<3>{};
|
||||
|
||||
amd_inner_product_dlop(vector_type<half_t, 8>{a}.AsType<half2_t>()[I0],
|
||||
vector_type<half_t, 8>{b}.AsType<half2_t>()[I0],
|
||||
c);
|
||||
|
||||
amd_inner_product_dlop(vector_type<half_t, 8>{a}.AsType<half2_t>()[I1],
|
||||
vector_type<half_t, 8>{b}.AsType<half2_t>()[I1],
|
||||
c);
|
||||
|
||||
amd_inner_product_dlop(vector_type<half_t, 8>{a}.AsType<half2_t>()[I2],
|
||||
vector_type<half_t, 8>{b}.AsType<half2_t>()[I2],
|
||||
c);
|
||||
|
||||
amd_inner_product_dlop(vector_type<half_t, 8>{a}.AsType<half2_t>()[I3],
|
||||
vector_type<half_t, 8>{b}.AsType<half2_t>()[I3],
|
||||
c);
|
||||
}
|
||||
|
||||
template <>
|
||||
__device__ void amd_inner_product_dlop<int8x4_t, int8x4_t, int32_t>(const int8x4_t& a,
|
||||
const int8x4_t& b,
|
||||
int32_t& c)
|
||||
{
|
||||
#if CK_USE_AMD_DLOP_INLINE_ASM
|
||||
asm volatile("\n \
|
||||
v_dot4_i32_i8 %0, %1, %2, %0\n \
|
||||
"
|
||||
: "=v"(c)
|
||||
: "v"(as_type<int32_t>(a)), "v"(as_type<int32_t>(b)), "0"(c));
|
||||
#else
|
||||
c = __builtin_amdgcn_sdot4(as_type<int32_t>(a), as_type<int32_t>(b), c, false);
|
||||
#endif
|
||||
}
|
||||
|
||||
template <>
|
||||
__device__ void amd_inner_product_dlop<int8x8_t, int8x8_t, int32_t>(const int8x8_t& a,
|
||||
const int8x8_t& b,
|
||||
int32_t& c)
|
||||
{
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
|
||||
amd_inner_product_dlop(vector_type<int8_t, 8>{a}.AsType<int8x4_t>()[I0],
|
||||
vector_type<int8_t, 8>{b}.AsType<int8x4_t>()[I0],
|
||||
c);
|
||||
|
||||
amd_inner_product_dlop(vector_type<int8_t, 8>{a}.AsType<int8x4_t>()[I1],
|
||||
vector_type<int8_t, 8>{b}.AsType<int8x4_t>()[I1],
|
||||
c);
|
||||
}
|
||||
|
||||
template <>
|
||||
__device__ void amd_inner_product_dlop<int8x16_t, int8x16_t, int32_t>(const int8x16_t& a,
|
||||
const int8x16_t& b,
|
||||
int32_t& c)
|
||||
{
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
constexpr auto I3 = Number<3>{};
|
||||
|
||||
amd_inner_product_dlop(vector_type<int8_t, 16>{a}.AsType<int8x4_t>()[I0],
|
||||
vector_type<int8_t, 16>{b}.AsType<int8x4_t>()[I0],
|
||||
c);
|
||||
|
||||
amd_inner_product_dlop(vector_type<int8_t, 16>{a}.AsType<int8x4_t>()[I1],
|
||||
vector_type<int8_t, 16>{b}.AsType<int8x4_t>()[I1],
|
||||
c);
|
||||
|
||||
amd_inner_product_dlop(vector_type<int8_t, 16>{a}.AsType<int8x4_t>()[I2],
|
||||
vector_type<int8_t, 16>{b}.AsType<int8x4_t>()[I2],
|
||||
c);
|
||||
|
||||
amd_inner_product_dlop(vector_type<int8_t, 16>{a}.AsType<int8x4_t>()[I3],
|
||||
vector_type<int8_t, 16>{b}.AsType<int8x4_t>()[I3],
|
||||
c);
|
||||
}
|
||||
#endif // CK_USE_AMD_DLOP
|
||||
|
||||
} // namespace ck
|
||||
#endif
|
||||
@@ -4,6 +4,8 @@
|
||||
#include "data_type.hpp"
|
||||
#include "c_style_pointer_cast.hpp"
|
||||
|
||||
// TODO: deprecate all amd_assembly_outer_product_xxx
|
||||
|
||||
namespace ck {
|
||||
|
||||
// c0 += inner_product(a, b0)
|
||||
|
||||
@@ -31,15 +31,13 @@
|
||||
#include "static_buffer.hpp"
|
||||
#include "dynamic_buffer.hpp"
|
||||
|
||||
#include "inner_product.hpp"
|
||||
|
||||
// TODO: remove this
|
||||
#if CK_USE_AMD_INLINE_ASM
|
||||
#include "amd_inline_asm.hpp"
|
||||
#endif
|
||||
|
||||
#if CK_USE_AMD_DLOP
|
||||
#include "amd_dlop.hpp"
|
||||
#endif
|
||||
|
||||
#if CK_USE_AMD_XDLOPS
|
||||
#include "amd_xdlops.hpp"
|
||||
#endif
|
||||
|
||||
@@ -14,12 +14,7 @@
|
||||
// should enable one and only one GPU target
|
||||
#if !(defined(CK_AMD_GPU_GFX803) || defined(CK_AMD_GPU_GFX900) || defined(CK_AMD_GPU_GFX906) || \
|
||||
defined(CK_AMD_GPU_GFX908) || defined(CK_AMD_GPU_GFX90A) || defined(CK_AMD_GPU_GFX1030))
|
||||
#error Need to define a single GPU target
|
||||
#endif
|
||||
|
||||
// HIP version
|
||||
#ifndef CK_HIP_VERSION_FLAT
|
||||
#define CK_HIP_VERSION_FLAT 0
|
||||
#error Need to define (only) one GPU target
|
||||
#endif
|
||||
|
||||
// launch bounds
|
||||
@@ -38,6 +33,16 @@
|
||||
#define CK_BUFFER_RESOURCE_3RD_DWORD 0x31014000
|
||||
#endif
|
||||
|
||||
// FMA instruction
|
||||
#if defined(CK_AMD_GPU_GFX803) || defined(CK_AMD_GPU_GFX900)
|
||||
#define CK_USE_AMD_V_MAC_F32
|
||||
#elif defined(CK_AMD_GPU_GFX906) || defined(CK_AMD_GPU_GFX908) || defined(CK_AMD_GPU_GFX90a) || \
|
||||
defined(CK_AMD_GPU_GFX1030)
|
||||
#define CK_USE_AMD_V_FMAC_F32
|
||||
#define CK_USE_AMD_V_DOT2_F32_F16
|
||||
#define CK_USE_AMD_V_DOT4_I32_I8
|
||||
#endif
|
||||
|
||||
// multi index
|
||||
#define CK_USE_DYNAMICALLY_INDEXED_MULTI_INDEX 0
|
||||
|
||||
@@ -46,13 +51,9 @@
|
||||
#define CK_USE_AMD_INLINE_ASM 1
|
||||
#endif
|
||||
|
||||
// AMD DLOPS
|
||||
#ifndef CK_USE_AMD_DLOP
|
||||
#define CK_USE_AMD_DLOP 1
|
||||
#endif
|
||||
|
||||
#ifndef CK_USE_AMD_DLOP_INLINE_ASM
|
||||
#define CK_USE_AMD_DLOP_INLINE_ASM 1
|
||||
// AMD inner product (DLOP)
|
||||
#ifndef CK_USE_AMD_INNER_PRODUCT_INLINE_ASM
|
||||
#define CK_USE_AMD_INNER_PRODUCT_INLINE_ASM 1
|
||||
#endif
|
||||
|
||||
// AMD buffer addressing
|
||||
|
||||
207
composable_kernel/include/utility/inner_product.hpp
Normal file
207
composable_kernel/include/utility/inner_product.hpp
Normal file
@@ -0,0 +1,207 @@
|
||||
#ifndef CK_INNER_PRODUCT_HPP
|
||||
#define CK_INNER_PRODUCT_HPP
|
||||
|
||||
#include "data_type.hpp"
|
||||
|
||||
namespace ck {
|
||||
|
||||
template <typename TA, typename TB, typename TC>
|
||||
__device__ void inner_product(const TA& a, const TB& b, TC& c);
|
||||
|
||||
template <>
|
||||
__device__ void inner_product<float, float, float>(const float& a, const float& b, float& c)
|
||||
{
|
||||
#if CK_USE_AMD_INNER_PRODUCT_INLINE_ASM && defined(CK_USE_AMD_V_MAC_F32)
|
||||
asm volatile("\n \
|
||||
v_mac_f32 %0, %1, %2 \n \
|
||||
"
|
||||
: "=v"(c)
|
||||
: "v"(a), "v"(b), "0"(c));
|
||||
#elif CK_USE_AMD_INNER_PRODUCT_INLINE_ASM && defined(CK_USE_AMD_V_FMAC_F32)
|
||||
asm volatile("\n \
|
||||
v_fmac_f32 %0, %1, %2 \n \
|
||||
"
|
||||
: "=v"(c)
|
||||
: "v"(a), "v"(b), "0"(c));
|
||||
#else
|
||||
c += a * b;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <>
|
||||
__device__ void
|
||||
inner_product<float2_t, float2_t, float>(const float2_t& a, const float2_t& b, float& c)
|
||||
{
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
|
||||
inner_product(vector_type<float, 2>{a}.AsType<float>()[I0],
|
||||
vector_type<float, 2>{b}.AsType<float>()[I0],
|
||||
c);
|
||||
|
||||
inner_product(vector_type<float, 2>{a}.AsType<float>()[I1],
|
||||
vector_type<float, 2>{b}.AsType<float>()[I1],
|
||||
c);
|
||||
}
|
||||
|
||||
template <>
|
||||
__device__ void
|
||||
inner_product<float4_t, float4_t, float>(const float4_t& a, const float4_t& b, float& c)
|
||||
{
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
constexpr auto I3 = Number<3>{};
|
||||
|
||||
inner_product(vector_type<float, 4>{a}.AsType<float>()[I0],
|
||||
vector_type<float, 4>{b}.AsType<float>()[I0],
|
||||
c);
|
||||
|
||||
inner_product(vector_type<float, 4>{a}.AsType<float>()[I1],
|
||||
vector_type<float, 4>{b}.AsType<float>()[I1],
|
||||
c);
|
||||
|
||||
inner_product(vector_type<float, 4>{a}.AsType<float>()[I2],
|
||||
vector_type<float, 4>{b}.AsType<float>()[I2],
|
||||
c);
|
||||
|
||||
inner_product(vector_type<float, 4>{a}.AsType<float>()[I3],
|
||||
vector_type<float, 4>{b}.AsType<float>()[I3],
|
||||
c);
|
||||
}
|
||||
|
||||
template <>
|
||||
__device__ void inner_product<half2_t, half2_t, float>(const half2_t& a, const half2_t& b, float& c)
|
||||
{
|
||||
#if defined(CK_USE_AMD_V_DOT2_F32_F16)
|
||||
#if CK_USE_AMD_INNER_PRODUCT_INLINE_ASM
|
||||
asm volatile("\n \
|
||||
v_dot2_f32_f16 %0, %1, %2, %0\n \
|
||||
"
|
||||
: "=v"(c)
|
||||
: "v"(a), "v"(b), "0"(c));
|
||||
#else
|
||||
c = __builtin_amdgcn_sdot2(a, b, c, false);
|
||||
#endif
|
||||
#else
|
||||
const auto convert = type_convert<int32_t>{};
|
||||
|
||||
const vector_type<half_t, 2> a_vector{a};
|
||||
const vector_type<half_t, 2> b_vector{b};
|
||||
|
||||
static_for<0, 2, 1>{}([&](auto i) {
|
||||
c += convert(a_vector.AsType<half_t>()[i]) * convert(b_vector.AsType<half_t>()[i]);
|
||||
});
|
||||
#endif
|
||||
}
|
||||
|
||||
template <>
|
||||
__device__ void inner_product<half4_t, half4_t, float>(const half4_t& a, const half4_t& b, float& c)
|
||||
{
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
|
||||
inner_product(vector_type<half_t, 4>{a}.AsType<half2_t>()[I0],
|
||||
vector_type<half_t, 4>{b}.AsType<half2_t>()[I0],
|
||||
c);
|
||||
|
||||
inner_product(vector_type<half_t, 4>{a}.AsType<half2_t>()[I1],
|
||||
vector_type<half_t, 4>{b}.AsType<half2_t>()[I1],
|
||||
c);
|
||||
}
|
||||
|
||||
template <>
|
||||
__device__ void inner_product<half8_t, half8_t, float>(const half8_t& a, const half8_t& b, float& c)
|
||||
{
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
constexpr auto I3 = Number<3>{};
|
||||
|
||||
inner_product(vector_type<half_t, 8>{a}.AsType<half2_t>()[I0],
|
||||
vector_type<half_t, 8>{b}.AsType<half2_t>()[I0],
|
||||
c);
|
||||
|
||||
inner_product(vector_type<half_t, 8>{a}.AsType<half2_t>()[I1],
|
||||
vector_type<half_t, 8>{b}.AsType<half2_t>()[I1],
|
||||
c);
|
||||
|
||||
inner_product(vector_type<half_t, 8>{a}.AsType<half2_t>()[I2],
|
||||
vector_type<half_t, 8>{b}.AsType<half2_t>()[I2],
|
||||
c);
|
||||
|
||||
inner_product(vector_type<half_t, 8>{a}.AsType<half2_t>()[I3],
|
||||
vector_type<half_t, 8>{b}.AsType<half2_t>()[I3],
|
||||
c);
|
||||
}
|
||||
|
||||
template <>
|
||||
__device__ void
|
||||
inner_product<int8x4_t, int8x4_t, int32_t>(const int8x4_t& a, const int8x4_t& b, int32_t& c)
|
||||
{
|
||||
#if defined(CK_USE_DOT4_I32_I8)
|
||||
#if CK_USE_AMD_INNER_PRODUCT_INLINE_ASM
|
||||
asm volatile("\n \
|
||||
v_dot4_i32_i8 %0, %1, %2, %0\n \
|
||||
"
|
||||
: "=v"(c)
|
||||
: "v"(as_type<int32_t>(a)), "v"(as_type<int32_t>(b)), "0"(c));
|
||||
#else
|
||||
c = __builtin_amdgcn_sdot4(as_type<int32_t>(a), as_type<int32_t>(b), c, false);
|
||||
#endif
|
||||
#else
|
||||
const auto convert = type_convert<int32_t>{};
|
||||
|
||||
const vector_type<int8_t, 4> a_vector{a};
|
||||
const vector_type<int8_t, 4> b_vector{b};
|
||||
|
||||
static_for<0, 4, 1>{}([&](auto i) {
|
||||
c += convert(a_vector.AsType<int8_t>()[i]) * convert(b_vector.AsType<int8_t>()[i]);
|
||||
});
|
||||
#endif
|
||||
}
|
||||
|
||||
template <>
|
||||
__device__ void
|
||||
inner_product<int8x8_t, int8x8_t, int32_t>(const int8x8_t& a, const int8x8_t& b, int32_t& c)
|
||||
{
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
|
||||
inner_product(vector_type<int8_t, 8>{a}.AsType<int8x4_t>()[I0],
|
||||
vector_type<int8_t, 8>{b}.AsType<int8x4_t>()[I0],
|
||||
c);
|
||||
|
||||
inner_product(vector_type<int8_t, 8>{a}.AsType<int8x4_t>()[I1],
|
||||
vector_type<int8_t, 8>{b}.AsType<int8x4_t>()[I1],
|
||||
c);
|
||||
}
|
||||
|
||||
template <>
|
||||
__device__ void
|
||||
inner_product<int8x16_t, int8x16_t, int32_t>(const int8x16_t& a, const int8x16_t& b, int32_t& c)
|
||||
{
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
constexpr auto I3 = Number<3>{};
|
||||
|
||||
inner_product(vector_type<int8_t, 16>{a}.AsType<int8x4_t>()[I0],
|
||||
vector_type<int8_t, 16>{b}.AsType<int8x4_t>()[I0],
|
||||
c);
|
||||
|
||||
inner_product(vector_type<int8_t, 16>{a}.AsType<int8x4_t>()[I1],
|
||||
vector_type<int8_t, 16>{b}.AsType<int8x4_t>()[I1],
|
||||
c);
|
||||
|
||||
inner_product(vector_type<int8_t, 16>{a}.AsType<int8x4_t>()[I2],
|
||||
vector_type<int8_t, 16>{b}.AsType<int8x4_t>()[I2],
|
||||
c);
|
||||
|
||||
inner_product(vector_type<int8_t, 16>{a}.AsType<int8x4_t>()[I3],
|
||||
vector_type<int8_t, 16>{b}.AsType<int8x4_t>()[I3],
|
||||
c);
|
||||
}
|
||||
|
||||
} // namespace ck
|
||||
#endif
|
||||
@@ -48,7 +48,7 @@ void device_convolution_forward_implicit_gemm_v6r1_dlops_nchw_kcyx_nkhw(
|
||||
const auto wei_desc_k_c_y_x = make_naive_tensor_descriptor_packed(wei_k_c_y_x_lengths);
|
||||
const auto out_desc_n_k_ho_wo = make_naive_tensor_descriptor_packed(out_n_k_ho_wo_lengths);
|
||||
|
||||
#if 0
|
||||
#if 1
|
||||
// [8, 1, 128, 1] * [8, 4, 32, 1] = [1, 128, 4, 32] for fp32
|
||||
// cdata = 64, BlockSize = 256
|
||||
constexpr index_t BlockSize = 256;
|
||||
|
||||
@@ -20,9 +20,9 @@
|
||||
#include "device_convolution_forward_implicit_gemm_v4r4r4_xdlops_nhwc_kyxc_nhwk.hpp"
|
||||
|
||||
#define USE_MODE 1
|
||||
#define USE_CONV_FWD_V4R4_NCHW 0
|
||||
#define USE_CONV_FWD_V4R4R2_NHWC 1
|
||||
#define USE_CONV_FWD_V6R1_NCHW 1
|
||||
#define USE_CONV_FWD_V4R4_NCHW 1
|
||||
#define USE_CONV_FWD_V4R4R2_NHWC 0
|
||||
#define USE_CONV_FWD_V6R1_NCHW 0
|
||||
#define USE_CONV_FWD_V5R1_NCHW 0
|
||||
#define USE_CONV_FWD_V4R4R2_XDL_NCHW 0
|
||||
#define USE_CONV_FWD_V4R4R4_XDL_NHWC 0
|
||||
@@ -126,7 +126,7 @@ int main(int argc, char* argv[])
|
||||
const index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
#if 1
|
||||
using in_data_t = float;
|
||||
using acc_data_t = float;
|
||||
using out_data_t = float;
|
||||
|
||||
Reference in New Issue
Block a user