mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-11 08:50:17 +00:00
refactor
This commit is contained in:
@@ -633,6 +633,7 @@ int main(int argc, char* argv[])
|
||||
|
||||
if(do_verification)
|
||||
{
|
||||
#if 1
|
||||
if(Y == 3 && X == 3)
|
||||
{
|
||||
host_winograd_3x3_convolution(in_nchw, wei_kcsr, out_nkhw_host, lower_pads, upper_pads);
|
||||
@@ -642,6 +643,7 @@ int main(int argc, char* argv[])
|
||||
host_direct_convolution(in_nchw, wei_kcsr, out_nkhw_host, lower_pads, upper_pads);
|
||||
}
|
||||
check_error(out_nkhw_host, out_nkhw_device);
|
||||
#endif
|
||||
|
||||
#if 0
|
||||
LogRange(std::cout << "in_nchw : ", in_nchw.mData, ",") << std::endl;
|
||||
|
||||
@@ -373,6 +373,8 @@ template <unsigned BlockSize,
|
||||
unsigned DataPerRead>
|
||||
struct Blockwise2dTensorCopy3
|
||||
{
|
||||
using vector_t = typename vector_type<Float, DataPerRead>::type;
|
||||
|
||||
unsigned mSrcMyThreadOffset;
|
||||
unsigned mDstMyThreadOffset;
|
||||
|
||||
@@ -424,11 +426,6 @@ struct Blockwise2dTensorCopy3
|
||||
|
||||
__device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
|
||||
{
|
||||
static_assert(is_same<Float, float>::value, "wrong! only support float!\n");
|
||||
|
||||
using Float2 = float2;
|
||||
using Float4 = float4;
|
||||
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
|
||||
@@ -454,27 +451,9 @@ struct Blockwise2dTensorCopy3
|
||||
constexpr unsigned dst_loop_stride = DstDesc{}.GetStride(I0) * thread_per_d0;
|
||||
|
||||
auto f_copy = [&](unsigned iloop) {
|
||||
if(DataPerRead == 1)
|
||||
{
|
||||
p_dst[mDstMyThreadOffset + iloop * dst_loop_stride] =
|
||||
p_src[mSrcMyThreadOffset + iloop * src_loop_stride];
|
||||
}
|
||||
else if(DataPerRead == 2)
|
||||
{
|
||||
*(reinterpret_cast<Float2*>(p_dst + mDstMyThreadOffset + iloop * dst_loop_stride)) =
|
||||
*(reinterpret_cast<const Float2*>(p_src + mSrcMyThreadOffset +
|
||||
iloop * src_loop_stride));
|
||||
}
|
||||
else if(DataPerRead == 4)
|
||||
{
|
||||
*(reinterpret_cast<Float4*>(p_dst + mDstMyThreadOffset + iloop * dst_loop_stride)) =
|
||||
*(reinterpret_cast<const Float4*>(p_src + mSrcMyThreadOffset +
|
||||
iloop * src_loop_stride));
|
||||
}
|
||||
else
|
||||
{
|
||||
assert(false);
|
||||
}
|
||||
*(reinterpret_cast<vector_t*>(p_dst + mDstMyThreadOffset + iloop * dst_loop_stride)) =
|
||||
*(reinterpret_cast<const vector_t*>(p_src + mSrcMyThreadOffset +
|
||||
iloop * src_loop_stride));
|
||||
};
|
||||
|
||||
for(unsigned iloop = 0; iloop < nloop_d0; ++iloop)
|
||||
@@ -514,11 +493,6 @@ struct Blockwise2dTensorCopy3
|
||||
__device__ void RunLoadRegisterClipboard(const Float* __restrict__ p_src,
|
||||
Float* p_clipboard) const
|
||||
{
|
||||
static_assert(is_same<Float, float>::value, "wrong! only support float!\n");
|
||||
|
||||
using Float2 = float2;
|
||||
using Float4 = float4;
|
||||
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
|
||||
@@ -544,26 +518,9 @@ struct Blockwise2dTensorCopy3
|
||||
constexpr unsigned dst_loop_stride = DstDesc{}.GetStride(I0) * thread_per_d0;
|
||||
|
||||
auto f_copy = [&](unsigned iloop) {
|
||||
if(DataPerRead == 1)
|
||||
{
|
||||
p_clipboard[iloop] = p_src[mSrcMyThreadOffset + iloop * src_loop_stride];
|
||||
}
|
||||
else if(DataPerRead == 2)
|
||||
{
|
||||
*(reinterpret_cast<Float2*>(p_clipboard + iloop * 2)) =
|
||||
*(reinterpret_cast<const Float2*>(p_src + mSrcMyThreadOffset +
|
||||
iloop * src_loop_stride));
|
||||
}
|
||||
else if(DataPerRead == 4)
|
||||
{
|
||||
*(reinterpret_cast<Float4*>(p_clipboard + iloop * 4)) =
|
||||
*(reinterpret_cast<const Float4*>(p_src + mSrcMyThreadOffset +
|
||||
iloop * src_loop_stride));
|
||||
}
|
||||
else
|
||||
{
|
||||
assert(false);
|
||||
}
|
||||
*(reinterpret_cast<vector_t*>(p_clipboard + iloop * 4)) =
|
||||
*(reinterpret_cast<const vector_t*>(p_src + mSrcMyThreadOffset +
|
||||
iloop * src_loop_stride));
|
||||
};
|
||||
|
||||
for(unsigned iloop = 0; iloop < nloop_d0; ++iloop)
|
||||
@@ -587,11 +544,6 @@ struct Blockwise2dTensorCopy3
|
||||
__device__ void RunStoreRegisterClipboard(const Float* __restrict__ p_clipboard,
|
||||
Float* __restrict__ p_dst) const
|
||||
{
|
||||
static_assert(is_same<Float, float>::value, "wrong! only support float!\n");
|
||||
|
||||
using Float2 = float2;
|
||||
using Float4 = float4;
|
||||
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
|
||||
@@ -617,24 +569,8 @@ struct Blockwise2dTensorCopy3
|
||||
constexpr unsigned dst_loop_stride = DstDesc{}.GetStride(I0) * thread_per_d0;
|
||||
|
||||
auto f_copy = [&](unsigned iloop) {
|
||||
if(DataPerRead == 1)
|
||||
{
|
||||
p_dst[mDstMyThreadOffset + iloop * dst_loop_stride] = p_clipboard[iloop];
|
||||
}
|
||||
else if(DataPerRead == 2)
|
||||
{
|
||||
*(reinterpret_cast<Float2*>(p_dst + mDstMyThreadOffset + iloop * dst_loop_stride)) =
|
||||
*(reinterpret_cast<const Float2*>(p_clipboard + iloop * 2));
|
||||
}
|
||||
else if(DataPerRead == 4)
|
||||
{
|
||||
*(reinterpret_cast<Float4*>(p_dst + mDstMyThreadOffset + iloop * dst_loop_stride)) =
|
||||
*(reinterpret_cast<const Float4*>(p_clipboard + iloop * 4));
|
||||
}
|
||||
else
|
||||
{
|
||||
assert(false);
|
||||
}
|
||||
*(reinterpret_cast<vector_t*>(p_dst + mDstMyThreadOffset + iloop * dst_loop_stride)) =
|
||||
*(reinterpret_cast<const vector_t*>(p_clipboard + iloop * 4));
|
||||
};
|
||||
|
||||
for(unsigned iloop = 0; iloop < nloop_d0; ++iloop)
|
||||
|
||||
@@ -349,6 +349,8 @@ template <unsigned BlockSize,
|
||||
unsigned DataPerRead>
|
||||
struct Blockwise4dTensorCopy3
|
||||
{
|
||||
using vector_t = typename vector_type<Float, DataPerRead>::type;
|
||||
|
||||
unsigned mSrcMyThreadOffset;
|
||||
unsigned mDstMyThreadOffset;
|
||||
|
||||
@@ -422,11 +424,6 @@ struct Blockwise4dTensorCopy3
|
||||
|
||||
__device__ void Run(const Float* __restrict__ p_src, Float* __restrict__ p_dst) const
|
||||
{
|
||||
static_assert(is_same<Float, float>::value, "wrong! only support float!\n");
|
||||
|
||||
using Float2 = float2;
|
||||
using Float4 = float4;
|
||||
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
constexpr auto I2 = Number<2>{};
|
||||
@@ -482,27 +479,9 @@ struct Blockwise4dTensorCopy3
|
||||
iloop_d2 * thread_per_d2,
|
||||
iloop_d3 * thread_per_d3 * DataPerRead);
|
||||
|
||||
if(DataPerRead == 1)
|
||||
{
|
||||
p_dst[dst_offset + mDstMyThreadOffset] =
|
||||
p_src[src_offset + mSrcMyThreadOffset];
|
||||
}
|
||||
else if(DataPerRead == 2)
|
||||
{
|
||||
*(reinterpret_cast<Float2*>(p_dst + dst_offset + mDstMyThreadOffset)) =
|
||||
*(reinterpret_cast<const Float2*>(p_src + src_offset +
|
||||
mSrcMyThreadOffset));
|
||||
}
|
||||
else if(DataPerRead == 4)
|
||||
{
|
||||
*(reinterpret_cast<Float4*>(p_dst + dst_offset + mDstMyThreadOffset)) =
|
||||
*(reinterpret_cast<const Float4*>(p_src + src_offset +
|
||||
mSrcMyThreadOffset));
|
||||
}
|
||||
else
|
||||
{
|
||||
assert(false);
|
||||
}
|
||||
*(reinterpret_cast<vector_t*>(p_dst + dst_offset + mDstMyThreadOffset)) =
|
||||
*(reinterpret_cast<const vector_t*>(p_src + src_offset +
|
||||
mSrcMyThreadOffset));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -16,6 +16,81 @@ struct is_same<T, T>
|
||||
static const bool value = true;
|
||||
};
|
||||
|
||||
// Trait that maps an element type T and an element count N to a vector type,
// exposed through a nested alias named `type`.
// The primary template has no members, so `type` is available only for the
// (T, N) pairs that have an explicit specialization.
template <class T, unsigned N>
|
||||
struct vector_type
|
||||
{
|
||||
};
|
||||
|
||||
// One float: `type` is plain float.
template <>
|
||||
struct vector_type<float, 1>
|
||||
{
|
||||
using type = float;
|
||||
};
|
||||
|
||||
// Two floats: `type` is float2.
template <>
|
||||
struct vector_type<float, 2>
|
||||
{
|
||||
using type = float2;
|
||||
};
|
||||
|
||||
// Four floats: `type` is float4.
template <>
|
||||
struct vector_type<float, 4>
|
||||
{
|
||||
using type = float4;
|
||||
};
|
||||
|
||||
#if 0
|
||||
template <>
|
||||
struct vector_type<half_float::half, 1>
|
||||
{
|
||||
using type = half_float::half;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct vector_type<half_float::half, 2>
|
||||
{
|
||||
using type = float;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct vector_type<half_float::half, 4>
|
||||
{
|
||||
using type = float2;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct vector_type<half_float::half, 8>
|
||||
{
|
||||
using type = float4;
|
||||
};
|
||||
#endif
|
||||
|
||||
#if 1
|
||||
// One half: `type` is half itself.
// NOTE(review): which header supplies the unqualified `half` presumably
// depends on the device backend - confirm.
template <>
|
||||
struct vector_type<half, 1>
|
||||
{
|
||||
using type = half;
|
||||
};
|
||||
|
||||
// Two halfs: `type` is half2.
template <>
|
||||
struct vector_type<half, 2>
|
||||
{
|
||||
using type = half2;
|
||||
};
|
||||
|
||||
// Four halfs: `type` is float2, not a half-based vector.
// NOTE(review): float2 is presumably a same-size carrier for moving four
// halfs in one access; its .x/.y fields would not be valid half values -
// confirm against the call sites.
template <>
|
||||
struct vector_type<half, 4>
|
||||
{
|
||||
using type = float2;
|
||||
};
|
||||
|
||||
// Eight halfs: `type` is float4, not a half-based vector.
// NOTE(review): float4 is presumably a same-size carrier for moving eight
// halfs in one access; its fields would not be valid half values - confirm
// against the call sites.
template <>
|
||||
struct vector_type<half, 8>
|
||||
{
|
||||
using type = float4;
|
||||
};
|
||||
#endif
|
||||
|
||||
template <class T, T N>
|
||||
struct integral_constant
|
||||
{
|
||||
|
||||
@@ -4,8 +4,10 @@
|
||||
|
||||
#if DEVICE_BACKEND_HIP
|
||||
#include "hip/hip_runtime.h"
|
||||
#include "half.hpp"
|
||||
#elif DEVICE_BACKEND_CUDA
|
||||
#include "cuda_runtime.h"
|
||||
#include "nvToolsExt.h"
|
||||
#include "helper_cuda.h"
|
||||
#include "cuda_fp16.h"
|
||||
#endif
|
||||
|
||||
Reference in New Issue
Block a user