mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-12 01:10:17 +00:00
refactor
This commit is contained in:
@@ -51,7 +51,7 @@ template <index_t GridSize,
|
||||
index_t WeiBlockCopyDstDataPerWrite_K>
|
||||
struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded
|
||||
{
|
||||
#if 0
|
||||
#if 1
|
||||
__device__ void Run(const Float* const __restrict__ p_in_global,
|
||||
const Float* const __restrict__ p_wei_global,
|
||||
Float* const __restrict__ p_out_global) const
|
||||
|
||||
@@ -187,16 +187,20 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
|
||||
// weight tensor
|
||||
// tensor descriptor in device memory, src of blockwise copy
|
||||
constexpr auto wei_e_k_global_desc =
|
||||
#if 0
|
||||
transform_tensor_descriptor(wei_k_c_y_x_global_desc,
|
||||
make_tuple(Merge<Sequence<C, Y, X>>{}, PassThrough<K>{}),
|
||||
make_tuple(Sequence<1, 2, 3>{}, Sequence<0>{}),
|
||||
make_tuple(Sequence<0>{}, Sequence<1>{}));
|
||||
#else // hack
|
||||
make_native_tensor_descriptor_packed(Sequence<K, C * Y * X>{});
|
||||
#endif
|
||||
|
||||
// tensor descriptor in LDS, dst of blockwise copy
|
||||
// be careful of LDS alignment
|
||||
constexpr auto wei_e_k_block_desc = make_native_tensor_descriptor_aligned(
|
||||
Sequence<EPerBlock, KPerBlock>{},
|
||||
Number<math::lcm(WeiBlockCopyDstDataPerWrite_K, GemmDataPerReadA)>{});
|
||||
// tensor descriptor in LDS, dst of blockwise copy
|
||||
// be careful of LDS alignment
|
||||
constexpr auto wei_e_k_block_desc = make_native_tensor_descriptor_aligned(
|
||||
Sequence<EPerBlock, KPerBlock>{},
|
||||
Number<math::lcm(WeiBlockCopyDstDataPerWrite_K, GemmDataPerReadA)>{});
|
||||
|
||||
// operator for blockwise copy of weight into LDS
|
||||
// slice a tensor, and copy it into another tensor
|
||||
|
||||
@@ -47,7 +47,7 @@ struct NativeTensorCoordinate
|
||||
// mIndex is updated here, but some (or all) of its entries may never be used
|
||||
mIndex += idx_diff;
|
||||
|
||||
mOffset += tensor_desc_type::CalculateOffset(idx_diff);
|
||||
mOffset += tensor_desc_type::CalculateOffsetDiff(idx_diff);
|
||||
|
||||
return *this;
|
||||
}
|
||||
@@ -57,7 +57,7 @@ struct NativeTensorCoordinate
|
||||
// mIndex is updated here, but some (or all) of its entries may never be used
|
||||
mIndex -= idx_diff;
|
||||
|
||||
mOffset -= tensor_desc_type::CalculateOffset(idx_diff);
|
||||
mOffset -= tensor_desc_type::CalculateOffsetDiff(idx_diff);
|
||||
|
||||
return *this;
|
||||
}
|
||||
|
||||
@@ -684,12 +684,10 @@ template <index_t BlockSize,
|
||||
struct BlockwiseGenericTensorSliceCopy_v4
|
||||
{
|
||||
static constexpr index_t nDim = SrcDesc::GetNumOfDimension();
|
||||
using Index = MultiIndex<nDim>;
|
||||
|
||||
using SrcCoord = typename TensorCoordinate_v2<SrcDesc>::type;
|
||||
using DstCoord = typename TensorCoordinate_v2<DstDesc>::type;
|
||||
|
||||
__device__ constexpr BlockwiseGenericTensorSliceCopy_v4(SrcCoord src_block_slice_origin,
|
||||
DstCoord dst_block_slice_origin)
|
||||
__device__ constexpr BlockwiseGenericTensorSliceCopy_v4(const Index& src_block_slice_origin,
|
||||
const Index& dst_block_slice_origin)
|
||||
{
|
||||
static_assert(nDim == SrcDesc::GetNumOfDimension() &&
|
||||
nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::Size() &&
|
||||
|
||||
@@ -966,8 +966,8 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
|
||||
using SrcCoord = typename TensorCoordinate_v2<SrcDesc>::type;
|
||||
using DstCoord = typename TensorCoordinate_v2<DstDesc>::type;
|
||||
|
||||
__device__ constexpr ThreadwiseGenericTensorSliceCopy_v4r2(SrcCoord src_slice_origin,
|
||||
DstCoord dst_slice_origin)
|
||||
__device__ constexpr ThreadwiseGenericTensorSliceCopy_v4r2(const Index& src_slice_origin,
|
||||
const Index& dst_slice_origin)
|
||||
: mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin)
|
||||
{
|
||||
static_assert(nDim == SrcDesc::GetNumOfDimension() &&
|
||||
|
||||
Reference in New Issue
Block a user