mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-16 19:09:59 +00:00
adding int8 direct that reads pre-vectorized data
[ROCm/composable_kernel commit: 050a1a6890]
This commit is contained in:
@@ -10,16 +10,6 @@ namespace CUDA {
|
||||
using half = CUDA::half;
|
||||
using half2 = CUDA::half2;
|
||||
|
||||
struct half4
|
||||
{
|
||||
half data[4];
|
||||
};
|
||||
|
||||
struct half8
|
||||
{
|
||||
half data[8];
|
||||
};
|
||||
|
||||
template <class T, unsigned N>
|
||||
struct vector_type
|
||||
{
|
||||
@@ -119,39 +109,141 @@ struct vector_type<half2, 4>
|
||||
using MemoryType = float4;
|
||||
};
|
||||
|
||||
template <class TDst, class TSrc0, class TSrc1, class TSrc2>
|
||||
__device__ void fused_multiply_add(TDst& d, TSrc0 s0, TSrc1 s1, TSrc2 s2)
|
||||
template <>
|
||||
struct vector_type<char, 1>
|
||||
{
|
||||
using MemoryType = char;
|
||||
|
||||
__host__ __device__ static MemoryType Pack(char s) { return s; }
|
||||
};
|
||||
|
||||
template <>
|
||||
struct vector_type<char, 2>
|
||||
{
|
||||
using MemoryType = char2;
|
||||
|
||||
__host__ __device__ static MemoryType Pack(char s0, char s1)
|
||||
{
|
||||
union
|
||||
{
|
||||
MemoryType vector;
|
||||
char scalar[2];
|
||||
} data;
|
||||
|
||||
data.scalar[0] = s0;
|
||||
data.scalar[1] = s1;
|
||||
return data.vector;
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct vector_type<char, 4>
|
||||
{
|
||||
using MemoryType = char4;
|
||||
|
||||
__host__ __device__ static MemoryType Pack(char s0, char s1, char s2, char s3)
|
||||
{
|
||||
union
|
||||
{
|
||||
MemoryType vector;
|
||||
char scalar[4];
|
||||
} data;
|
||||
|
||||
data.scalar[0] = s0;
|
||||
data.scalar[1] = s1;
|
||||
data.scalar[2] = s2;
|
||||
data.scalar[3] = s3;
|
||||
return data.vector;
|
||||
}
|
||||
};
|
||||
|
||||
template <>
|
||||
struct vector_type<char, 8>
|
||||
{
|
||||
using MemoryType = int64_t;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct vector_type<char2, 2>
|
||||
{
|
||||
using MemoryType = char4;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct vector_type<char2, 4>
|
||||
{
|
||||
using MemoryType = int64_t;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct vector_type<char4, 2>
|
||||
{
|
||||
using MemoryType = int64_t;
|
||||
};
|
||||
|
||||
template <class TDst, class TSrc0, class TSrc1>
|
||||
__device__ void fused_multiply_accumulate(TDst& d, const TSrc0& s0, const TSrc1& s1)
|
||||
{
|
||||
// static_assert(false, "should not call into base");
|
||||
printf("should not call into base");
|
||||
assert(false);
|
||||
}
|
||||
|
||||
template <>
|
||||
__device__ void fused_multiply_add(float& d, float s0, float s1, float s2)
|
||||
__device__ void fused_multiply_accumulate(float& d, const float& s0, const float& s1)
|
||||
{
|
||||
d = s0 * s1 + s2;
|
||||
d += s0 * s1;
|
||||
}
|
||||
|
||||
template <>
|
||||
__device__ void fused_multiply_add(float& d, float2 s0, float2 s1, float s2)
|
||||
__device__ void fused_multiply_accumulate(float& d, const float2& s0, const float2& s1)
|
||||
{
|
||||
d = s0.x * s1.x + s0.y * s1.y + s2;
|
||||
d += s0.x * s1.x;
|
||||
d += s0.y * s1.y;
|
||||
}
|
||||
|
||||
template <>
|
||||
__device__ void fused_multiply_add(float& d, float4 s0, float4 s1, float s2)
|
||||
__device__ void fused_multiply_accumulate(float& d, const float4& s0, const float4& s1)
|
||||
{
|
||||
d = s0.x * s1.x + s0.y * s1.y + s0.z * s1.z + s0.w * s1.w + s2;
|
||||
d += s0.x * s1.x;
|
||||
d += s0.y * s1.y;
|
||||
d += s0.z * s1.z;
|
||||
d += s0.w * s1.w;
|
||||
}
|
||||
|
||||
template <>
|
||||
__device__ void fused_multiply_add(half& d, half s0, half s1, half s2)
|
||||
__device__ void fused_multiply_accumulate(half& d, const half& s0, const half& s1)
|
||||
{
|
||||
d = s0 * s1 + s2;
|
||||
d += s0 * s1;
|
||||
}
|
||||
|
||||
template <>
|
||||
__device__ void fused_multiply_add(half& d, half2 s0, half2 s1, half s2)
|
||||
__device__ void fused_multiply_accumulate(half& d, const half2& s0, const half2& s1)
|
||||
{
|
||||
d = s0.x * s1.x + s0.y * s1.y + s2;
|
||||
}
|
||||
d += s0.x * s1.x;
|
||||
d += s0.y * s1.y;
|
||||
}
|
||||
|
||||
#if 0
|
||||
template <>
|
||||
__device__ void fused_multiply_accumulate(float& d, const half2& s0, const half2& s1)
|
||||
{
|
||||
d += s0.x * s1.x + s0.y * s1.y;
|
||||
}
|
||||
#endif
|
||||
|
||||
template <>
|
||||
__device__ void fused_multiply_accumulate(char& d, const char& s0, const char& s1)
|
||||
{
|
||||
d += s0 * s1;
|
||||
}
|
||||
|
||||
template <>
|
||||
__device__ void fused_multiply_accumulate(int32_t& d, const char4& s0, const char4& s1)
|
||||
{
|
||||
#if DEVICE_BACKEND_CUDA
|
||||
d = __dp4a(s0, s1, d);
|
||||
#else
|
||||
d += s0.x * s1.x + s0.y * s1.y + s0.z * s1.z + s0.w * s1.w;
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -7,7 +7,9 @@
|
||||
#include "threadwise_4d_tensor_op.hip.hpp"
|
||||
#include "threadwise_direct_convolution.hip.hpp"
|
||||
|
||||
template <class Float,
|
||||
template <class TInWei,
|
||||
class TOut,
|
||||
class TAccum,
|
||||
class InGlobalDesc,
|
||||
class WeiGlobalDesc,
|
||||
class OutGlobalDesc,
|
||||
@@ -27,14 +29,16 @@ template <class Float,
|
||||
unsigned BlockSize,
|
||||
unsigned GridSize>
|
||||
__global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
|
||||
const typename vector_type<Float,
|
||||
const typename vector_type<TInWei,
|
||||
ScalarPerVector>::MemoryType* const __restrict__ p_in_vec_global,
|
||||
const typename vector_type<Float,
|
||||
const typename vector_type<TInWei,
|
||||
ScalarPerVector>::MemoryType* const __restrict__ p_wei_vec_global,
|
||||
Float* const __restrict__ p_out_global)
|
||||
TOut* const __restrict__ p_out_global)
|
||||
{
|
||||
using scalar_t = Float;
|
||||
using vector_mem_t = typename vector_type<scalar_t, ScalarPerVector>::MemoryType;
|
||||
using in_scalar_t = TInWei;
|
||||
using in_vector_mem_t = typename vector_type<in_scalar_t, ScalarPerVector>::MemoryType;
|
||||
using out_scalar_t = TOut;
|
||||
using accum_t = TAccum;
|
||||
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
@@ -79,9 +83,9 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
|
||||
? InBlockCopyDataPerRead
|
||||
: WeiBlockCopyDataPerRead;
|
||||
|
||||
__shared__ vector_mem_t
|
||||
__shared__ in_vector_mem_t
|
||||
p_in_vec_block[max_align * ((in_block_size + max_align - 1) / max_align)];
|
||||
__shared__ vector_mem_t
|
||||
__shared__ in_vector_mem_t
|
||||
p_wei_vec_block[max_align * ((wei_block_size + max_align - 1) / max_align)];
|
||||
|
||||
// threadwise tensors
|
||||
@@ -99,7 +103,7 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
|
||||
in_nchw_vec_thread_block_desc, wei_kcyx_vec_thread_block_desc);
|
||||
|
||||
// register
|
||||
scalar_t p_out_thread[out_nkhw_thread_desc.GetElementSpace()];
|
||||
out_scalar_t p_out_thread[out_nkhw_thread_desc.GetElementSpace()];
|
||||
|
||||
// divide block work
|
||||
constexpr unsigned NBlockWork =
|
||||
@@ -155,7 +159,7 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
|
||||
|
||||
constexpr auto blockwise_in_copy =
|
||||
Blockwise4dTensorCopy1<BlockSize,
|
||||
vector_mem_t,
|
||||
in_vector_mem_t,
|
||||
decltype(in_nchw_vec_global_desc),
|
||||
decltype(in_nchw_vec_block_desc),
|
||||
decltype(in_nchw_vec_block_desc.GetLengths()),
|
||||
@@ -164,7 +168,7 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
|
||||
#if 0
|
||||
constexpr auto blockwise_wei_copy =
|
||||
Blockwise4dTensorCopy1<BlockSize,
|
||||
vector_mem_t,
|
||||
in_vector_mem_t,
|
||||
decltype(wei_kcyx_vec_global_desc),
|
||||
decltype(wei_kcyx_vec_block_desc),
|
||||
decltype(wei_kcyx_vec_block_desc.GetLengths()),
|
||||
@@ -172,15 +176,17 @@ __global__ void gridwise_direct_convolution_2_vectorized_nchw_kcyx_nkhw(
|
||||
#elif 1
|
||||
const auto blockwise_wei_copy =
|
||||
Blockwise2dTensorCopy3<BlockSize,
|
||||
vector_mem_t,
|
||||
in_vector_mem_t,
|
||||
decltype(wei_ke_vec_global_desc),
|
||||
decltype(wei_ke_vec_block_desc),
|
||||
decltype(wei_ke_vec_block_desc.GetLengths()),
|
||||
WeiBlockCopyDataPerRead>{};
|
||||
#endif
|
||||
|
||||
#if 1 // debug
|
||||
// set threadwise output tensor to 0
|
||||
threadwise_4d_tensor_set_zero(out_nkhw_thread_desc, p_out_thread);
|
||||
#endif
|
||||
|
||||
for(unsigned c_block_data_begin = 0; c_block_data_begin < C;
|
||||
c_block_data_begin += CPerBlock, __syncthreads())
|
||||
|
||||
@@ -37,7 +37,8 @@ __device__ void threadwise_4d_tensor_pointwise_operation_unary(Desc, Float* __re
|
||||
|
||||
// TODO: in order to optimize mem access for different mem type,
|
||||
// need to write specialized version
|
||||
template <class Float,
|
||||
template <class SrcData,
|
||||
class DstData,
|
||||
class SrcDesc,
|
||||
class DstDesc,
|
||||
class SrcOpLengths,
|
||||
@@ -45,9 +46,9 @@ template <class Float,
|
||||
class F>
|
||||
__device__ void threadwise_4d_tensor_pointwise_operation_binary_reorder_by_get_dst_from_src(
|
||||
SrcDesc,
|
||||
const Float* __restrict__ p_src,
|
||||
const SrcData* __restrict__ p_src,
|
||||
DstDesc,
|
||||
Float* __restrict__ p_dst,
|
||||
DstData* __restrict__ p_dst,
|
||||
SrcOpLengths,
|
||||
DstFromSrcReorder,
|
||||
F f)
|
||||
@@ -88,33 +89,38 @@ __device__ void threadwise_4d_tensor_pointwise_operation_binary_reorder_by_get_d
|
||||
}
|
||||
}
|
||||
|
||||
template <class Float, class Desc>
|
||||
__device__ void threadwise_4d_tensor_set_zero(Desc, Float* __restrict__ p)
|
||||
template <class Data, class Desc>
|
||||
__device__ void threadwise_4d_tensor_set_zero(Desc, Data* __restrict__ p)
|
||||
{
|
||||
auto f_set_zero = [](Float& v) { v = Float(0); };
|
||||
auto f_set_zero = [](Data& v) { v = Data(0); };
|
||||
|
||||
threadwise_4d_tensor_pointwise_operation_unary<Float, Desc, decltype(f_set_zero)>(
|
||||
threadwise_4d_tensor_pointwise_operation_unary<Data, Desc, decltype(f_set_zero)>(
|
||||
Desc{}, p, f_set_zero);
|
||||
}
|
||||
|
||||
template <class Float, class SrcDesc, class DstDesc, class SrcOpLengths, class DstFromSrcReorder>
|
||||
template <class SrcData,
|
||||
class DstData,
|
||||
class SrcDesc,
|
||||
class DstDesc,
|
||||
class SrcOpLengths,
|
||||
class DstFromSrcReorder>
|
||||
__device__ void
|
||||
threadwise_4d_tensor_copy_reorder_by_get_dst_from_src(SrcDesc,
|
||||
const Float* __restrict__ p_src,
|
||||
const SrcData* __restrict__ p_src,
|
||||
DstDesc,
|
||||
Float* __restrict__ p_dst,
|
||||
DstData* __restrict__ p_dst,
|
||||
SrcOpLengths,
|
||||
DstFromSrcReorder)
|
||||
{
|
||||
auto f_copy = [](const Float& src, Float& dst) { dst = src; };
|
||||
auto f_copy = [](const SrcData& src, DstData& dst) { dst = static_cast<DstData>(src); };
|
||||
|
||||
threadwise_4d_tensor_pointwise_operation_binary_reorder_by_get_dst_from_src(
|
||||
SrcDesc{}, p_src, DstDesc{}, p_dst, SrcOpLengths{}, DstFromSrcReorder{}, f_copy);
|
||||
}
|
||||
|
||||
template <class Float, class SrcDesc, class DstDesc, class SrcOpLengths>
|
||||
template <class SrcData, class DstData, class SrcDesc, class DstDesc, class SrcOpLengths>
|
||||
__device__ void threadwise_4d_tensor_copy(
|
||||
SrcDesc, const Float* __restrict__ p_src, DstDesc, Float* __restrict__ p_dst, SrcOpLengths)
|
||||
SrcDesc, const SrcData* __restrict__ p_src, DstDesc, DstData* __restrict__ p_dst, SrcOpLengths)
|
||||
{
|
||||
auto dst_from_src_reorder = Sequence<0, 1, 2, 3>{};
|
||||
|
||||
|
||||
@@ -51,10 +51,8 @@ __device__ void threadwise_direct_convolution_1(InDesc,
|
||||
|
||||
const unsigned out_index = out_desc.Get1dIndex(n, k, ho, wo);
|
||||
|
||||
fused_multiply_add(p_out[out_index],
|
||||
p_wei[wei_index],
|
||||
p_in[in_index],
|
||||
p_out[out_index]);
|
||||
fused_multiply_accumulate(
|
||||
p_out[out_index], p_wei[wei_index], p_in[in_index]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user