add LDS double buffer to nchw padded v4r1 and v4r4

[ROCm/composable_kernel commit: bf97542846]
This commit is contained in:
Chao Liu
2019-09-15 16:58:16 -05:00
parent e6285a3b55
commit a4edaf2ae3
7 changed files with 153 additions and 135 deletions

View File

@@ -59,7 +59,6 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto I5 = Number<5>{};
constexpr auto True = integral_constant<bool, true>{};
@@ -330,7 +329,6 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded
constexpr auto I1 = Number<1>{};
constexpr auto I2 = Number<2>{};
constexpr auto I3 = Number<3>{};
constexpr auto I5 = Number<5>{};
constexpr auto True = integral_constant<bool, true>{};

View File

@@ -25,14 +25,14 @@ namespace ck {
// repeat-length on the merged dimension need to be 1. These sanity checks are performed
// in constructor of BlockwiseGenericTensorSliceCopy_v1
template <index_t BlockSize,
class SrcDesc,
class DstDesc,
class SliceLengths,
class SubLengths,
class ThreadClusterLengths,
class ThreadClusterArrangeOrder,
class SrcDimAccessOrder,
class DstDimAccessOrder,
typename SrcDesc,
typename DstDesc,
typename SliceLengths,
typename SubLengths,
typename ThreadClusterLengths,
typename ThreadClusterArrangeOrder,
typename SrcDimAccessOrder,
typename DstDimAccessOrder,
index_t SrcVectorAccessDim,
index_t DstVectorAccessDim,
index_t SrcDataPerAccess,
@@ -204,7 +204,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
return GetRegisterBufferDescriptor().GetElementSpace();
}
template <class TData>
template <typename TData>
__device__ void RunLoadRegisterBuffer(const TData* __restrict__ p_src,
TData* __restrict__ p_buffer) const
{
@@ -260,7 +260,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
});
}
template <class TData>
template <typename TData>
__device__ void RunStoreRegisterBuffer(const TData* __restrict__ p_buffer,
TData* __restrict__ p_dst) const
{
@@ -315,7 +315,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
});
}
template <class TData>
template <typename TData>
__device__ void Run(const TData* __restrict__ p_src, TData* __restrict__ p_dst) const
{
TData p_buffer[GetRegisterBufferSize()];
@@ -406,7 +406,7 @@ struct BlockwiseGenericTensorSliceCopy_v1
});
}
template <class T, bool PositiveDirection>
template <typename T, bool PositiveDirection>
__device__ void
MoveSrcSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection> positive_direction)
{
@@ -423,14 +423,14 @@ struct BlockwiseGenericTensorSliceCopy_v1
// Slice a (normal or merged) tensor, and copy it into another (normal or merged) tensor
// memory layout (ordering of dimensions) can be different between src and dst.
template <index_t BlockSize,
class SrcDesc,
class DstDesc,
class SliceLengths,
class SubLengths,
class ThreadClusterLengths,
class ThreadClusterArrangeOrder,
class SrcDimAccessOrder,
class DstDimAccessOrder,
typename SrcDesc,
typename DstDesc,
typename SliceLengths,
typename SubLengths,
typename ThreadClusterLengths,
typename ThreadClusterArrangeOrder,
typename SrcDimAccessOrder,
typename DstDimAccessOrder,
index_t SrcVectorAccessDim,
index_t DstVectorAccessDim,
index_t SrcDataPerAccess,
@@ -482,19 +482,19 @@ struct BlockwiseGenericTensorSliceCopy_v2
return RegisterBufferDesc::GetElementSpace();
}
template <class TData>
template <typename TData>
__device__ void RunLoadRegisterBuffer(const TData* p_src, TData* p_buffer) const
{
mThreadwiseLoad.Run(p_src, p_buffer);
}
template <class TData>
template <typename TData>
__device__ void RunStoreRegisterBuffer(const TData* p_buffer, TData* p_dst) const
{
mThreadwiseStore.Run(p_buffer, p_dst);
}
template <class TData>
template <typename TData>
__device__ void Run(const TData* p_src, TData* p_dst) const
{
TData p_buffer[GetRegisterBufferSize()];
@@ -503,14 +503,14 @@ struct BlockwiseGenericTensorSliceCopy_v2
mThreadwiseStore.Run(p_buffer, p_dst);
}
template <class T, bool PositiveDirection>
template <typename T, bool PositiveDirection>
__device__ void
MoveSrcSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection> positive_direction)
{
mThreadwiseLoad.MoveSrcSliceWindow(step_sizes, positive_direction);
}
template <class T, bool PositiveDirection>
template <typename T, bool PositiveDirection>
__device__ void
MoveDstSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection> positive_direction)
{
@@ -546,14 +546,14 @@ struct BlockwiseGenericTensorSliceCopy_v2
// this version use TensorView and TensorCoordinate
template <index_t BlockSize,
class SrcTensor,
class DstTensor,
class SliceLengths,
class SubLengths,
class ThreadClusterLengths,
class ThreadClusterArrangeOrder,
class SrcDimAccessOrder,
class DstDimAccessOrder,
typename SrcTensor,
typename DstTensor,
typename SliceLengths,
typename SubLengths,
typename ThreadClusterLengths,
typename ThreadClusterArrangeOrder,
typename SrcDimAccessOrder,
typename DstDimAccessOrder,
index_t SrcVectorAccessDim,
index_t DstVectorAccessDim,
index_t SrcDataPerAccess,
@@ -622,14 +622,14 @@ struct BlockwiseGenericTensorSliceCopy_v3
mThreadwiseStore.Run();
}
template <class T, bool PositiveDirection>
template <typename T, bool PositiveDirection>
__device__ void
MoveSrcSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection> positive_direction)
{
mThreadwiseLoad.MoveSrcSliceWindow(step_sizes, positive_direction);
}
template <class T, bool PositiveDirection>
template <typename T, bool PositiveDirection>
__device__ void
MoveDstSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection> positive_direction)
{
@@ -669,14 +669,14 @@ struct BlockwiseGenericTensorSliceCopy_v3
};
template <index_t BlockSize,
class SrcDesc,
class DstDesc,
class SliceLengths,
class SubLengths,
class ThreadClusterLengths,
class ThreadClusterArrangeOrder,
class SrcDimAccessOrder,
class DstDimAccessOrder,
typename SrcDesc,
typename DstDesc,
typename SliceLengths,
typename SubLengths,
typename ThreadClusterLengths,
typename ThreadClusterArrangeOrder,
typename SrcDimAccessOrder,
typename DstDimAccessOrder,
index_t SrcVectorAccessDim,
index_t DstVectorAccessDim,
index_t SrcDataPerAccess,
@@ -727,19 +727,19 @@ struct BlockwiseGenericTensorSliceCopy_v4
return RegisterBufferDesc::GetElementSpace();
}
template <class TData>
template <typename TData>
__device__ void RunLoadRegisterBuffer(const TData* p_src, TData* p_buffer) const
{
mThreadwiseLoad.Run(p_src, p_buffer);
}
template <class TData>
template <typename TData>
__device__ void RunStoreRegisterBuffer(const TData* p_buffer, TData* p_dst) const
{
mThreadwiseStore.Run(p_buffer, p_dst);
}
template <class TData>
template <typename TData>
__device__ void Run(const TData* p_src, TData* p_dst) const
{
TData p_buffer[GetRegisterBufferSize()];
@@ -748,16 +748,18 @@ struct BlockwiseGenericTensorSliceCopy_v4
mThreadwiseStore.Run(p_buffer, p_dst);
}
template <class T, bool PositiveDirection>
template <typename T, bool PositiveDirection>
__device__ void
MoveSrcSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection> positive_direction)
MoveSrcSliceWindow(const T& step_sizes,
integral_constant<bool, PositiveDirection> positive_direction)
{
mThreadwiseLoad.MoveSrcSliceWindow(step_sizes, positive_direction);
}
template <class T, bool PositiveDirection>
template <typename T, bool PositiveDirection>
__device__ void
MoveDstSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection> positive_direction)
MoveDstSliceWindow(const T& step_sizes,
integral_constant<bool, PositiveDirection> positive_direction)
{
mThreadwiseStore.MoveDstSliceWindow(step_sizes, positive_direction);
}

View File

@@ -1072,16 +1072,22 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
}
template <class T, bool PositiveDirection>
__device__ void MoveSrcSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection>)
__device__ void MoveSrcSliceWindow(const T& step_sizes_,
integral_constant<bool, PositiveDirection>)
{
const auto step_sizes = to_array(step_sizes_);
static_if<PositiveDirection>{}([&](auto) {
mSrcSliceOrigin += step_sizes;
mSrcSliceOrigin += to_array(step_sizes);
}).Else([&](auto) { mSrcSliceOrigin -= step_sizes; });
}
template <class T, bool PositiveDirection>
__device__ void MoveDstSliceWindow(T step_sizes, integral_constant<bool, PositiveDirection>)
__device__ void MoveDstSliceWindow(const T& step_sizes_,
integral_constant<bool, PositiveDirection>)
{
const auto step_sizes = to_array(step_sizes_);
static_if<PositiveDirection>{}([&](auto) {
mDstSliceOrigin += step_sizes;
}).Else([&](auto) { mDstSliceOrigin -= step_sizes; });