refactor

2026-05-12 01:10:17 +00:00 · 2019-09-17 11:19:15 -05:00
parent f7be86b9e4
commit e1a67b693e
5 changed files with 17 additions and 15 deletions
--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp
@@ -51,7 +51,7 @@ template <index_t GridSize,
          index_t WeiBlockCopyDstDataPerWrite_K>
 struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded
 {
-#if 0
+#if 1
    __device__ void Run(const Float* const __restrict__ p_in_global,
                        const Float* const __restrict__ p_wei_global,
                        Float* const __restrict__ p_out_global) const
--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp
@@ -187,16 +187,20 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
        // weight tensor
        //     tensor descriptor in device memory, src of blockwise copy
        constexpr auto wei_e_k_global_desc =
+#if 0
            transform_tensor_descriptor(wei_k_c_y_x_global_desc,
                                        make_tuple(Merge<Sequence<C, Y, X>>{}, PassThrough<K>{}),
                                        make_tuple(Sequence<1, 2, 3>{}, Sequence<0>{}),
                                        make_tuple(Sequence<0>{}, Sequence<1>{}));
+#else // hack
+            make_native_tensor_descriptor_packed(Sequence<K, C * Y * X>{});
+#endif

-        //     tensor descriptor in LDS, dst of blockwise copy
-        //     be careful of LDS alignment
-        constexpr auto wei_e_k_block_desc = make_native_tensor_descriptor_aligned(
-            Sequence<EPerBlock, KPerBlock>{},
-            Number<math::lcm(WeiBlockCopyDstDataPerWrite_K, GemmDataPerReadA)>{});
+            //     tensor descriptor in LDS, dst of blockwise copy
+            //     be careful of LDS alignment
+            constexpr auto wei_e_k_block_desc = make_native_tensor_descriptor_aligned(
+                Sequence<EPerBlock, KPerBlock>{},
+                Number<math::lcm(WeiBlockCopyDstDataPerWrite_K, GemmDataPerReadA)>{});

        // operator for blockwise copy of weight into LDS
        //     slice a tensor, and copy it into another tensor
--- a/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp
+++ b/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp
@@ -47,7 +47,7 @@ struct NativeTensorCoordinate
        // mIndex is updated here, but some (or all) of its entries may never be used
        mIndex += idx_diff;

-        mOffset += tensor_desc_type::CalculateOffset(idx_diff);
+        mOffset += tensor_desc_type::CalculateOffsetDiff(idx_diff);

        return *this;
    }
@@ -57,7 +57,7 @@ struct NativeTensorCoordinate
        // mIndex is updated here, but some (or all) of its entries may never be used
        mIndex -= idx_diff;

-        mOffset -= tensor_desc_type::CalculateOffset(idx_diff);
+        mOffset -= tensor_desc_type::CalculateOffsetDiff(idx_diff);

        return *this;
    }
--- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
+++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
@@ -684,12 +684,10 @@ template <index_t BlockSize,
 struct BlockwiseGenericTensorSliceCopy_v4
 {
    static constexpr index_t nDim = SrcDesc::GetNumOfDimension();
+    using Index                   = MultiIndex<nDim>;

-    using SrcCoord = typename TensorCoordinate_v2<SrcDesc>::type;
-    using DstCoord = typename TensorCoordinate_v2<DstDesc>::type;
-
-    __device__ constexpr BlockwiseGenericTensorSliceCopy_v4(SrcCoord src_block_slice_origin,
-                                                            DstCoord dst_block_slice_origin)
+    __device__ constexpr BlockwiseGenericTensorSliceCopy_v4(const Index& src_block_slice_origin,
+                                                            const Index& dst_block_slice_origin)
    {
        static_assert(nDim == SrcDesc::GetNumOfDimension() &&
                          nDim == DstDesc::GetNumOfDimension() && nDim == SliceLengths::Size() &&
--- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
+++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
@@ -966,8 +966,8 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
    using SrcCoord = typename TensorCoordinate_v2<SrcDesc>::type;
    using DstCoord = typename TensorCoordinate_v2<DstDesc>::type;

-    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v4r2(SrcCoord src_slice_origin,
-                                                               DstCoord dst_slice_origin)
+    __device__ constexpr ThreadwiseGenericTensorSliceCopy_v4r2(const Index& src_slice_origin,
+                                                               const Index& dst_slice_origin)
        : mSrcSliceOrigin(src_slice_origin), mDstSliceOrigin(dst_slice_origin)
    {
        static_assert(nDim == SrcDesc::GetNumOfDimension() &&