mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-11 17:00:18 +00:00
clipboard float4 copy and paste C++ code
This commit is contained in:
@@ -238,9 +238,6 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
|
||||
constexpr index_t GemmNLevel1Cluster = 4;
|
||||
constexpr index_t GemmKPerThreadLoop = 1;
|
||||
|
||||
constexpr index_t GemmThreadPerColumnPerCluster = 8;
|
||||
constexpr index_t GemmThreadPerRowPerCluster = 8;
|
||||
|
||||
constexpr index_t InBlockCopyThreadPerDim0 = 4;
|
||||
constexpr index_t InBlockCopyThreadPerDim1 = 16;
|
||||
|
||||
@@ -272,7 +269,7 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
|
||||
for(index_t i = 0; i < nrepeat; ++i)
|
||||
{
|
||||
constexpr auto gridwise_conv =
|
||||
#if 1
|
||||
#if 0
|
||||
GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn
|
||||
#else
|
||||
GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
|
||||
@@ -288,8 +285,6 @@ void device_implicit_gemm_convolution_2_chwn_cyxk_khwn(InDesc,
|
||||
CPerBlock,
|
||||
BPerThread,
|
||||
KPerThread,
|
||||
GemmThreadPerColumnPerCluster,
|
||||
GemmThreadPerRowPerCluster,
|
||||
GemmMPerThreadSubC,
|
||||
GemmNPerThreadSubC,
|
||||
GemmMLevel0Cluster,
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
#pragma once
|
||||
#include "ConstantTensorDescriptor.hip.hpp"
|
||||
#include "inline_asm.hpp"
|
||||
|
||||
template <index_t BlockSize, class Float, class DstDesc, class F>
|
||||
__device__ void
|
||||
@@ -517,9 +518,9 @@ struct Blockwise2dTensorCopy3
|
||||
constexpr index_t dst_loop_stride = DstDesc{}.GetStride(I0) * thread_per_d0;
|
||||
|
||||
auto f_copy = [&](index_t iloop) {
|
||||
*(reinterpret_cast<vector_t*>(p_clipboard + iloop * 4)) =
|
||||
*(reinterpret_cast<const vector_t*>(p_src + mSrcMyThreadOffset +
|
||||
iloop * src_loop_stride));
|
||||
*(reinterpret_cast<vector_t*>(&p_clipboard[iloop * DataPerRead])) =
|
||||
*(reinterpret_cast<const vector_t*>(
|
||||
&p_src[mSrcMyThreadOffset + iloop * src_loop_stride]));
|
||||
};
|
||||
|
||||
for(index_t iloop = 0; iloop < nloop_d0; ++iloop)
|
||||
@@ -568,8 +569,8 @@ struct Blockwise2dTensorCopy3
|
||||
constexpr index_t dst_loop_stride = DstDesc{}.GetStride(I0) * thread_per_d0;
|
||||
|
||||
auto f_copy = [&](index_t iloop) {
|
||||
*(reinterpret_cast<vector_t*>(p_dst + mDstMyThreadOffset + iloop * dst_loop_stride)) =
|
||||
*(reinterpret_cast<const vector_t*>(p_clipboard + iloop * 4));
|
||||
*(reinterpret_cast<vector_t*>(&p_dst[mDstMyThreadOffset + iloop * dst_loop_stride])) =
|
||||
*(reinterpret_cast<const vector_t*>(&p_clipboard[iloop * DataPerRead]));
|
||||
};
|
||||
|
||||
for(index_t iloop = 0; iloop < nloop_d0; ++iloop)
|
||||
@@ -589,4 +590,124 @@ struct Blockwise2dTensorCopy3
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#if DEVICE_BACKEND_HIP
|
||||
__device__ void RunLoadRegisterClipboard_asm(const Float* __restrict__ p_src,
|
||||
Float* p_clipboard) const
|
||||
{
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
|
||||
constexpr index_t L0 = CopyLengths{}.Get(I0);
|
||||
constexpr index_t L1 = CopyLengths{}.Get(I1);
|
||||
|
||||
constexpr index_t thread_per_d1 = (L1 + DataPerRead - 1) / DataPerRead;
|
||||
constexpr index_t thread_per_d0 = BlockSize / thread_per_d1;
|
||||
|
||||
constexpr index_t num_active_thread = thread_per_d0 * thread_per_d1;
|
||||
|
||||
if(BlockSize > num_active_thread)
|
||||
{
|
||||
if(get_thread_local_1d_id() >= num_active_thread)
|
||||
{
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
constexpr index_t nloop_d0 = L0 / thread_per_d0;
|
||||
|
||||
constexpr index_t src_loop_stride = SrcDesc{}.GetStride(I0) * thread_per_d0;
|
||||
constexpr index_t dst_loop_stride = DstDesc{}.GetStride(I0) * thread_per_d0;
|
||||
|
||||
auto f_copy = [&](index_t iloop) {
|
||||
#if 0
|
||||
*(reinterpret_cast<vector_t*>(&p_clipboard[iloop * DataPerRead])) =
|
||||
*(reinterpret_cast<const vector_t*>(&p_src[mSrcMyThreadOffset +
|
||||
iloop * src_loop_stride]));
|
||||
#else
|
||||
static_assert(is_same<float, Float>::value && DataPerRead == 4,
|
||||
"global_load is only for float4");
|
||||
|
||||
global_load(reinterpret_cast<vector_t&>(p_clipboard[iloop * DataPerRead]),
|
||||
reinterpret_cast<const vector_t*>(
|
||||
&p_src[mSrcMyThreadOffset + iloop * src_loop_stride]));
|
||||
#endif
|
||||
};
|
||||
|
||||
for(index_t iloop = 0; iloop < nloop_d0; ++iloop)
|
||||
{
|
||||
f_copy(iloop);
|
||||
}
|
||||
|
||||
constexpr bool has_tail_d0 = (L0 > nloop_d0 * thread_per_d0);
|
||||
|
||||
if(has_tail_d0)
|
||||
{
|
||||
constexpr index_t tail_d0 = L0 - nloop_d0 * thread_per_d0;
|
||||
|
||||
if(get_thread_local_1d_id() < tail_d0 * thread_per_d1)
|
||||
{
|
||||
f_copy(nloop_d0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__device__ void RunStoreRegisterClipboard_asm(const Float* __restrict__ p_clipboard,
|
||||
Float* __restrict__ p_dst) const
|
||||
{
|
||||
constexpr auto I0 = Number<0>{};
|
||||
constexpr auto I1 = Number<1>{};
|
||||
|
||||
constexpr index_t L0 = CopyLengths{}.Get(I0);
|
||||
constexpr index_t L1 = CopyLengths{}.Get(I1);
|
||||
|
||||
constexpr index_t thread_per_d1 = (L1 + DataPerRead - 1) / DataPerRead;
|
||||
constexpr index_t thread_per_d0 = BlockSize / thread_per_d1;
|
||||
|
||||
constexpr index_t num_active_thread = thread_per_d0 * thread_per_d1;
|
||||
|
||||
if(BlockSize > num_active_thread)
|
||||
{
|
||||
if(get_thread_local_1d_id() >= num_active_thread)
|
||||
{
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
constexpr index_t nloop_d0 = L0 / thread_per_d0;
|
||||
|
||||
constexpr index_t src_loop_stride = SrcDesc{}.GetStride(I0) * thread_per_d0;
|
||||
constexpr index_t dst_loop_stride = DstDesc{}.GetStride(I0) * thread_per_d0;
|
||||
|
||||
auto f_copy = [&](index_t iloop) {
|
||||
#if 0
|
||||
*(reinterpret_cast<vector_t*>(&p_dst[mDstMyThreadOffset + iloop * dst_loop_stride]) =
|
||||
*(reinterpret_cast<const vector_t*>(&p_clipboard[iloop * DataPerRead]);
|
||||
#else
|
||||
static_assert(is_same<float, Float>::value && DataPerRead == 4,
|
||||
"ds_write_b128 is only for float4");
|
||||
|
||||
ds_write_b128(reinterpret_cast<const vector_t&>(p_clipboard[iloop * DataPerRead]),
|
||||
&p_dst[mDstMyThreadOffset + iloop * dst_loop_stride]);
|
||||
#endif
|
||||
};
|
||||
|
||||
for(index_t iloop = 0; iloop < nloop_d0; ++iloop)
|
||||
{
|
||||
f_copy(iloop);
|
||||
}
|
||||
|
||||
constexpr bool has_tail_d0 = (L0 > nloop_d0 * thread_per_d0);
|
||||
|
||||
if(has_tail_d0)
|
||||
{
|
||||
constexpr index_t tail_d0 = L0 - nloop_d0 * thread_per_d0;
|
||||
|
||||
if(get_thread_local_1d_id() < tail_d0 * thread_per_d1)
|
||||
{
|
||||
f_copy(nloop_d0);
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
};
|
||||
|
||||
@@ -9,13 +9,13 @@ struct vector_type
|
||||
template <>
|
||||
struct vector_type<float, 1>
|
||||
{
|
||||
using MemoryType = float;
|
||||
typedef float MemoryType;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct vector_type<float, 2>
|
||||
{
|
||||
using MemoryType = float2;
|
||||
typedef float MemoryType __attribute__((ext_vector_type(2)));
|
||||
|
||||
__host__ __device__ static MemoryType Pack(float s0, float s1)
|
||||
{
|
||||
@@ -34,13 +34,7 @@ struct vector_type<float, 2>
|
||||
template <>
|
||||
struct vector_type<float, 4>
|
||||
{
|
||||
using MemoryType = float4;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct vector_type<float2, 2>
|
||||
{
|
||||
using MemoryType = float4;
|
||||
typedef float MemoryType __attribute__((ext_vector_type(4)));
|
||||
};
|
||||
|
||||
#if 0
|
||||
|
||||
@@ -19,8 +19,6 @@ template <index_t GridSize,
|
||||
index_t CPerBlock,
|
||||
index_t BPerThread,
|
||||
index_t KPerThread,
|
||||
index_t GemmThreadPerColumnPerCluster,
|
||||
index_t GemmThreadPerRowPerCluster,
|
||||
index_t GemmMPerThreadSubC,
|
||||
index_t GemmNPerThreadSubC,
|
||||
index_t GemmMLevel0Cluster,
|
||||
@@ -98,34 +96,15 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
|
||||
constexpr auto out_kb_thread_desc =
|
||||
make_ConstantTensorDescriptor(Sequence<KPerThread, BPerThread>{});
|
||||
|
||||
#if 0
|
||||
if(get_thread_local_1d_id() == 0 && get_block_1d_id() == 0)
|
||||
{
|
||||
print_ConstantTensorDescriptor(in_chwn_global_desc, "in_chwn_global_desc");
|
||||
print_ConstantTensorDescriptor(wei_cyxk_global_desc, "wei_cyxk_global_desc");
|
||||
print_ConstantTensorDescriptor(out_khwn_global_desc, "out_khwn_global_desc");
|
||||
|
||||
print_ConstantTensorDescriptor(in_cb_global_desc, "in_cb_global_desc");
|
||||
print_ConstantTensorDescriptor(wei_ek_global_desc, "wei_ek_global_desc");
|
||||
|
||||
print_ConstantTensorDescriptor(in_cb_block_desc, "in_cb_block_desc");
|
||||
print_ConstantTensorDescriptor(wei_cyxk_block_desc, "wei_cyxk_block_desc");
|
||||
print_ConstantTensorDescriptor(wei_ek_block_desc, "wei_ek_block_desc");
|
||||
print_ConstantTensorDescriptor(out_kb_thread_desc, "out_kb_thread_desc");
|
||||
|
||||
printf("KPerBlock %u\n", KPerBlock);
|
||||
}
|
||||
#endif
|
||||
|
||||
// blockwise in copy
|
||||
// formmat is [CPerBlock,BPerBlock + BGhostRead]
|
||||
#if 0
|
||||
const auto blockwise_in_copy =
|
||||
Blockwise2dTensorCopy1<BlockSize,
|
||||
Float,
|
||||
decltype(in_cb_global_desc),
|
||||
decltype(in_cb_block_desc),
|
||||
decltype(in_cb_block_desc.GetLengths())>{};
|
||||
const auto blockwise_in_copy =
|
||||
Blockwise2dTensorCopy1<BlockSize,
|
||||
Float,
|
||||
decltype(in_cb_global_desc),
|
||||
decltype(in_cb_block_desc),
|
||||
decltype(in_cb_block_desc.GetLengths())>{};
|
||||
#elif 0
|
||||
const auto blockwise_in_copy =
|
||||
Blockwise2dTensorCopy2<BlockSize,
|
||||
@@ -148,12 +127,12 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
|
||||
// blockwise wei copy
|
||||
// format is [CPerBlock*Y*X,KPerBlock]
|
||||
#if 0
|
||||
const auto blockwise_wei_copy =
|
||||
Blockwise2dTensorCopy1<BlockSize,
|
||||
Float,
|
||||
decltype(wei_ek_global_desc),
|
||||
decltype(wei_ek_block_desc),
|
||||
decltype(wei_ek_block_desc.GetLengths())>{};
|
||||
const auto blockwise_wei_copy =
|
||||
Blockwise2dTensorCopy1<BlockSize,
|
||||
Float,
|
||||
decltype(wei_ek_global_desc),
|
||||
decltype(wei_ek_block_desc),
|
||||
decltype(wei_ek_block_desc.GetLengths())>{};
|
||||
#elif 0
|
||||
const auto blockwise_wei_copy =
|
||||
Blockwise2dTensorCopy2<BlockSize,
|
||||
@@ -188,19 +167,6 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
|
||||
constexpr auto c_kxb_thread_mtx_desc =
|
||||
make_ConstantMatrixDescriptor(Number<KPerThread>{}, Number<BPerThread>{});
|
||||
|
||||
#if 0
|
||||
const auto blockwise_gemm = BlockwiseGemmBlockABlockBThreadC<BlockSize,
|
||||
decltype(a_cxk_block_mtx_desc),
|
||||
decltype(b_cxb_block_mtx_desc),
|
||||
decltype(c_kxb_thread_mtx_desc),
|
||||
true,
|
||||
false,
|
||||
false,
|
||||
GemmKPerThreadLoop,
|
||||
GemmThreadPerColumnPerCluster,
|
||||
GemmThreadPerRowPerCluster,
|
||||
true>{};
|
||||
#else
|
||||
const auto blockwise_gemm =
|
||||
BlockwiseGemmBlockABlockBThreadCTransANormalBNormalC_v2<BlockSize,
|
||||
decltype(a_cxk_block_mtx_desc),
|
||||
@@ -213,7 +179,6 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
|
||||
GemmMLevel1Cluster,
|
||||
GemmNLevel1Cluster,
|
||||
GemmKPerThreadLoop>{};
|
||||
#endif
|
||||
|
||||
// LDS: be careful of alignment
|
||||
constexpr index_t max_align =
|
||||
@@ -235,27 +200,9 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
|
||||
const Float* p_wei_global_block_offset =
|
||||
p_wei_global + wei_cyxk_global_desc.Get1dIndex(0, 0, 0, k_block_data_begin);
|
||||
|
||||
// preload data into LDS
|
||||
#if 0
|
||||
blockwise_in_copy.Run(p_in_global_block_offset, p_in_block_0);
|
||||
blockwise_wei_copy.Run(p_wei_global_block_offset, p_wei_block_0);
|
||||
#else
|
||||
Float4 tmp_in, tmp_wei;
|
||||
Float4* glb_in_p =
|
||||
(Float4*)(p_in_global_block_offset + blockwise_in_copy.mSrcMyThreadOffset);
|
||||
Float4* glb_wei_p =
|
||||
(Float4*)(p_wei_global_block_offset + blockwise_wei_copy.mSrcMyThreadOffset);
|
||||
|
||||
global_load(tmp_in, glb_in_p);
|
||||
global_load(tmp_wei, glb_wei_p);
|
||||
|
||||
Float4* loc_in_p = (Float4*)(p_in_block_double + blockwise_in_copy.mDstMyThreadOffset);
|
||||
Float4* loc_wei_p = (Float4*)(p_wei_block_double + blockwise_wei_copy.mDstMyThreadOffset);
|
||||
|
||||
vmcnt(0);
|
||||
ds_write_b128(tmp_in, loc_in_p);
|
||||
ds_write_b128(tmp_wei, loc_wei_p);
|
||||
#endif
|
||||
// preload data into LDS
|
||||
blockwise_in_copy.Run(p_in_global_block_offset, p_in_block_double);
|
||||
blockwise_wei_copy.Run(p_wei_global_block_offset, p_wei_block_double);
|
||||
|
||||
// register
|
||||
Float p_out_thread[out_kb_thread_desc.GetElementSpace()];
|
||||
@@ -285,18 +232,18 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
|
||||
p_wei_global_block_offset += CPerBlock * wei_cyxk_global_desc.GetStride(I0);
|
||||
|
||||
// load next data
|
||||
#if 0
|
||||
Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()];
|
||||
Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()];
|
||||
#if 1
|
||||
Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()];
|
||||
Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()];
|
||||
|
||||
__syncthreads();
|
||||
__syncthreads();
|
||||
|
||||
blockwise_in_copy.RunLoadRegisterClipboard(p_in_global_block_offset,
|
||||
p_in_register_clipboard);
|
||||
blockwise_in_copy.RunLoadRegisterClipboard(p_in_global_block_offset,
|
||||
p_in_register_clipboard);
|
||||
|
||||
blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset,
|
||||
p_wei_register_clipboard);
|
||||
#elif 1
|
||||
blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset,
|
||||
p_wei_register_clipboard);
|
||||
#elif 0
|
||||
Float4 tmp_in, tmp_wei;
|
||||
Float4* glb_in_p =
|
||||
(Float4*)(p_in_global_block_offset + blockwise_in_copy.mSrcMyThreadOffset);
|
||||
@@ -316,7 +263,7 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
|
||||
for(index_t x = 0; x < X; ++x)
|
||||
{
|
||||
#if 0
|
||||
blockwise_gemm.Run
|
||||
blockwise_gemm.Run
|
||||
#elif 0
|
||||
blockwise_gemm.Run_RegisterDoubleBuffer
|
||||
#elif 1
|
||||
@@ -328,11 +275,20 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
|
||||
}
|
||||
}
|
||||
|
||||
#if 0
|
||||
blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard, p_in_block_next);
|
||||
blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard,
|
||||
p_wei_block_next);
|
||||
#elif 1
|
||||
#if 1
|
||||
blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard,
|
||||
p_in_block_next);
|
||||
blockwise_wei_copy.RunStoreRegisterClipboard(p_wei_register_clipboard,
|
||||
p_wei_block_next);
|
||||
#elif 0
|
||||
// if work with RunLoadRegisterClipboard_asm, need to wait
|
||||
vmcnt(0);
|
||||
|
||||
blockwise_in_copy.RunStoreRegisterClipboard_asm(p_in_register_clipboard,
|
||||
p_in_block_next);
|
||||
blockwise_wei_copy.RunStoreRegisterClipboard_asm(p_wei_register_clipboard,
|
||||
p_wei_block_next);
|
||||
#elif 0
|
||||
Float4* loc_in_p =
|
||||
(Float4*)(p_in_block_next + blockwise_in_copy.mDstMyThreadOffset);
|
||||
Float4* loc_wei_p =
|
||||
@@ -352,16 +308,16 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
|
||||
p_in_global_block_offset += CPerBlock * in_cb_global_desc.GetStride(I0);
|
||||
p_wei_global_block_offset += CPerBlock * wei_cyxk_global_desc.GetStride(I0);
|
||||
|
||||
Float4 tmp_in, tmp_wei;
|
||||
Float4* glb_in_p =
|
||||
(Float4*)(p_in_global_block_offset + blockwise_in_copy.mSrcMyThreadOffset);
|
||||
Float4* glb_wei_p =
|
||||
(Float4*)(p_wei_global_block_offset + blockwise_wei_copy.mSrcMyThreadOffset);
|
||||
|
||||
__syncthreads();
|
||||
|
||||
global_load(tmp_in, glb_in_p);
|
||||
global_load(tmp_wei, glb_wei_p);
|
||||
Float p_in_register_clipboard[blockwise_in_copy.GetRegisterClipboardSize()];
|
||||
Float p_wei_register_clipboard[blockwise_wei_copy.GetRegisterClipboardSize()];
|
||||
|
||||
blockwise_in_copy.RunLoadRegisterClipboard(p_in_global_block_offset,
|
||||
p_in_register_clipboard);
|
||||
|
||||
blockwise_wei_copy.RunLoadRegisterClipboard(p_wei_global_block_offset,
|
||||
p_wei_register_clipboard);
|
||||
|
||||
for(index_t y = 0; y < Y; ++y)
|
||||
{
|
||||
@@ -369,10 +325,10 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
|
||||
{
|
||||
#if 0
|
||||
blockwise_gemm.Run
|
||||
#elif 1
|
||||
blockwise_gemm.Run_asm
|
||||
#elif 0
|
||||
blockwise_gemm.Run_RegisterDoubleBuffer
|
||||
#elif 1
|
||||
blockwise_gemm.Run_asm
|
||||
#endif
|
||||
(p_wei_block_double + wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
|
||||
p_in_block_double + y * Wi + x,
|
||||
@@ -380,14 +336,22 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
|
||||
}
|
||||
}
|
||||
|
||||
Float4* loc_in_p = (Float4*)(p_in_block_double + in_block_element_space +
|
||||
blockwise_in_copy.mDstMyThreadOffset);
|
||||
Float4* loc_wei_p = (Float4*)(p_wei_block_double + wei_block_element_space +
|
||||
blockwise_wei_copy.mDstMyThreadOffset);
|
||||
#if 1
|
||||
blockwise_in_copy.RunStoreRegisterClipboard(p_in_register_clipboard,
|
||||
p_in_block_double + in_block_element_space);
|
||||
|
||||
blockwise_wei_copy.RunStoreRegisterClipboard(
|
||||
p_wei_register_clipboard, p_wei_block_double + wei_block_element_space);
|
||||
#else
|
||||
// if work with RunLoadRegisterClipboard_asm, need to wait
|
||||
vmcnt(0);
|
||||
ds_write_b128(tmp_in, loc_in_p);
|
||||
ds_write_b128(tmp_wei, loc_wei_p);
|
||||
|
||||
blockwise_in_copy.RunStoreRegisterClipboard_asm(
|
||||
p_in_register_clipboard, p_in_block_double + in_block_element_space);
|
||||
|
||||
blockwise_wei_copy.RunStoreRegisterClipboard_asm(
|
||||
p_wei_register_clipboard, p_wei_block_double + wei_block_element_space);
|
||||
#endif
|
||||
|
||||
// odd
|
||||
__syncthreads();
|
||||
@@ -398,10 +362,10 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
|
||||
{
|
||||
#if 0
|
||||
blockwise_gemm.Run
|
||||
#elif 1
|
||||
blockwise_gemm.Run_asm
|
||||
#elif 0
|
||||
blockwise_gemm.Run_RegisterDoubleBuffer
|
||||
#elif 1
|
||||
blockwise_gemm.Run_asm
|
||||
#endif
|
||||
(p_wei_block_double + in_block_element_space +
|
||||
wei_cyxk_block_desc.Get1dIndex(0, y, x, 0),
|
||||
@@ -423,20 +387,6 @@ struct GridwiseConvolutionImplicitGemm_v2_chwn_cyxk_khwn_lds_double_buffer
|
||||
const index_t k_thread_data_begin = k_block_data_begin + c_thread_mtx_begin.row;
|
||||
const index_t b_thread_data_begin = b_block_data_begin + c_thread_mtx_begin.col;
|
||||
|
||||
#if 0
|
||||
if(get_block_1d_id() == 0)
|
||||
{
|
||||
printf("%u %u, row %u col %u, k_data_begin %u b_data_begin %u, %f %f %f %f\n",
|
||||
get_block_1d_id(),
|
||||
get_thread_local_1d_id(),
|
||||
matrix_c_index.row,
|
||||
matrix_c_index.col,
|
||||
k_data_begin,
|
||||
b_data_begin,
|
||||
p_out_thread[0], p_out_thread[1], p_out_thread[2], p_out_thread[3]);
|
||||
}
|
||||
#endif
|
||||
|
||||
for(index_t k = 0; k < out_kb_thread_desc.GetLength(I0); ++k)
|
||||
{
|
||||
for(index_t b = 0; b < out_kb_thread_desc.GetLength(I1); ++b)
|
||||
|
||||
@@ -10,7 +10,7 @@ extern "C" __attribute__((address_space(3))) void* __to_local(void* p)[[hc]];
|
||||
#define NO_DS_WRITE 0
|
||||
#define NO_GLB_READ 0
|
||||
|
||||
inline __device__ void vmcnt(int cnt)
|
||||
inline __device__ void vmcnt(index_t cnt)
|
||||
{
|
||||
#if !NO_VM_WAIT
|
||||
if(cnt == 0)
|
||||
@@ -39,12 +39,12 @@ inline __device__ void vmcnt(int cnt)
|
||||
}
|
||||
else
|
||||
{
|
||||
assert(0);
|
||||
assert(false);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
inline __device__ void lgkmcnt(int cnt)
|
||||
inline __device__ void lgkmcnt(index_t cnt)
|
||||
{
|
||||
#if !NO_LGKM_WAIT
|
||||
if(cnt == 0)
|
||||
@@ -79,7 +79,7 @@ inline __device__ void lgkmcnt(int cnt)
|
||||
}
|
||||
else
|
||||
{
|
||||
assert(0);
|
||||
assert(false);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
@@ -187,7 +187,7 @@ inline __device__ void outerProduct8x8(const Float4* a, const Float4* b, Float4*
|
||||
outerProduct4x4(a[1], b[1], c[9], c[11], c[13], c[15]);
|
||||
}
|
||||
|
||||
inline __device__ void ds_read_b128(Float4& r, void* lds, int offset = 0)
|
||||
inline __device__ void ds_read_b128(Float4& r, void* lds, index_t offset = 0)
|
||||
{
|
||||
#if !NO_DS_READ
|
||||
if(offset == 0)
|
||||
@@ -408,29 +408,43 @@ inline __device__ void ds_read_b128(Float4& r, void* lds, int offset = 0)
|
||||
}
|
||||
else
|
||||
{
|
||||
assert(0);
|
||||
assert(false);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
inline __device__ void global_load(Float4& r, Float4* ptr)
|
||||
inline __device__ void global_load(Float4& r, const Float4* ptr, index_t offset = 0)
|
||||
{
|
||||
#if !NO_GLB_READ
|
||||
asm volatile("\n \
|
||||
global_load_dwordx4 %0, %1, off \n \
|
||||
"
|
||||
: "=v"(r)
|
||||
: "v"(ptr));
|
||||
if(offset == 0)
|
||||
{
|
||||
asm volatile("\n \
|
||||
global_load_dwordx4 %0, %1, off \n \
|
||||
"
|
||||
: "=v"(r)
|
||||
: "v"(ptr));
|
||||
}
|
||||
else
|
||||
{
|
||||
assert(false);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
inline __device__ void ds_write_b128(Float4& r, void* lds, int offset = 0)
|
||||
inline __device__ void ds_write_b128(const Float4& r, void* lds, index_t offset = 0)
|
||||
{
|
||||
#if !NO_DS_WRITE
|
||||
asm volatile("\n \
|
||||
if(offset == 0)
|
||||
{
|
||||
asm volatile("\n \
|
||||
ds_write_b128 %0, %1 \n \
|
||||
"
|
||||
:
|
||||
: "v"(__to_local(lds)), "v"(r));
|
||||
:
|
||||
: "v"(__to_local(lds)), "v"(r));
|
||||
}
|
||||
else
|
||||
{
|
||||
assert(false);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
@@ -14,12 +14,29 @@ __device__ void threadwise_matrix_copy(SrcMatrix,
|
||||
|
||||
for(index_t i = 0; i < NRow; ++i)
|
||||
{
|
||||
for(index_t j = 0; j < NCol; ++j)
|
||||
// optimize for vector-4 load
|
||||
if(NCol % 4 == 0)
|
||||
{
|
||||
const index_t src_index = src_mtx.Get1dIndex(i, j);
|
||||
const index_t dst_index = dst_mtx.Get1dIndex(i, j);
|
||||
using vector_t = typename vector_type<Float, 4>::MemoryType;
|
||||
|
||||
p_dst[dst_index] = p_src[src_index];
|
||||
for(index_t j = 0; j < NCol / 4; ++j)
|
||||
{
|
||||
const index_t src_index = src_mtx.Get1dIndex(i, 4 * j);
|
||||
const index_t dst_index = dst_mtx.Get1dIndex(i, 4 * j);
|
||||
|
||||
*reinterpret_cast<vector_t*>(&p_dst[dst_index]) =
|
||||
*reinterpret_cast<const vector_t*>(&p_src[src_index]);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for(index_t j = 0; j < NCol; ++j)
|
||||
{
|
||||
const index_t src_index = src_mtx.Get1dIndex(i, j);
|
||||
const index_t dst_index = dst_mtx.Get1dIndex(i, j);
|
||||
|
||||
p_dst[dst_index] = p_src[src_index];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user