tweaking

2026-05-11 17:00:18 +00:00 · 2019-09-27 16:38:11 -05:00
parent 14315b72f3
commit 012d3a071b
8 changed files with 31 additions and 49 deletions
--- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp
+++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp
@@ -426,7 +426,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf
                                                      0,
                                                      b_thread_data_on_global,
                                                      0})
-#if 0
+#if 1
                .template Run<Float, Float, address_space_t::generic, address_space_t::global>
 #else // tweaking
                .template Run_optimized_dst_address_calculation<Float,
--- a/composable_kernel/include/tensor_description/tensor_coordinate.hpp
+++ b/composable_kernel/include/tensor_description/tensor_coordinate.hpp
@@ -78,10 +78,12 @@ struct NativeTensorCoordinate
        return coord;
    }

+#if 0 // tweaking
    __host__ __device__ static constexpr index_t CalculateOffsetDiff(const Index& idx_diff)
    {
        return tensor_desc_type::CalculateOffsetDiff(idx_diff);
    }
+#endif

    __host__ __device__ static constexpr bool IsUpperIndexMappedToValidOffset() { return true; }

@@ -175,6 +177,7 @@ struct TransformedTensorCoordinate
        return coord_up;
    }

+#if 0 // tweaking
    // Calculate offset diff without updating tensor-coordinate
    // If idx_up_diff is know at compile time, and has only non-zero entries on linear dimensions,
    //   then all calculation can be done at compile-time.
@@ -183,9 +186,12 @@ struct TransformedTensorCoordinate
        // For transformation of multi-index difference, not all transformation functions need to
        //   know the old lower-index or the old upper-index. We pass both of them to the
        //   transformation function. The transformation function itself decides to use them or not.
-        return GetLowerCoordinate().CalculateOffsetDiff(tensor_desc_type::CalculateLowerIndexDiff(
-            idx_up_diff, GetIndex(), GetLowerCoordinate().GetIndex()));
+        const auto idx_low_diff = tensor_desc_type::CalculateLowerIndexDiff(
+            idx_up_diff, GetIndex(), GetLowerCoordinate().GetIndex());
+
+        return GetLowerCoordinate().CalculateOffsetDiff(idx_low_diff);
    }
+#endif

    __host__ __device__ constexpr bool IsUpperIndexMappedToValidOffset() const
    {
@@ -209,7 +215,7 @@ struct TensorCoordinate
    private:
    template <typename... Ts>
    __host__ __device__ static constexpr auto
-    MakeDummyTensorCoordinate(NativeTensorDescriptor<Ts...>)
+        MakeDummyTensorCoordinate(NativeTensorDescriptor<Ts...>)
    {
        return NativeTensorCoordinate<NativeTensorDescriptor<Ts...>>(
            make_zero_array<index_t, TensorDesc::GetNumOfDimension()>());
@@ -217,7 +223,7 @@ struct TensorCoordinate

    template <typename... Ts>
    __host__ __device__ static constexpr auto
-    MakeDummyTensorCoordinate(TransformedTensorDescriptor<Ts...>)
+        MakeDummyTensorCoordinate(TransformedTensorDescriptor<Ts...>)
    {
        return TransformedTensorCoordinate<TransformedTensorDescriptor<Ts...>>(
            make_zero_array<index_t, TensorDesc::GetNumOfDimension()>());
--- a/composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp
+++ b/composable_kernel/include/tensor_description/tensor_coordinate_deprecated.hpp
@@ -326,14 +326,14 @@ struct TensorCoordinate_deprecated
    private:
    template <class... Ts>
    __host__ __device__ static constexpr auto
-    MakeDummyTensorCoordinate(ConstantTensorDescriptor<Ts...>)
+        MakeDummyTensorCoordinate(ConstantTensorDescriptor<Ts...>)
    {
        return NormalTensorCoordinate_deprecated<ConstantTensorDescriptor<Ts...>>();
    }

    template <class... Ts>
    __host__ __device__ static constexpr auto
-    MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor<Ts...>)
+        MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor<Ts...>)
    {
        return MergedTensorCoordinate<ConstantMergedTensorDescriptor<Ts...>>();
    }
--- a/composable_kernel/include/tensor_description/tensor_descriptor.hpp
+++ b/composable_kernel/include/tensor_description/tensor_descriptor.hpp
@@ -319,7 +319,7 @@ struct TransformedTensorDescriptor
        return idx_low;
    }

-    // TODO: right now return value is constexpr because use of non-constepxr lambda
+    // TODO: right now return value is not constexpr because use of non-constepxr lambda
    __host__ __device__ static constexpr LowerIndex CalculateLowerIndexDiff(
        const UpperIndex& idx_up_diff, const UpperIndex& idx_up_old, const LowerIndex& idx_low_old)
    {
--- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
+++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp
@@ -73,7 +73,7 @@ struct BlockwiseGenericTensorSliceCopy_v4
    __device__ void RunLoadThreadBuffer(const BlockSrcData* p_block_src,
                                        ThreadBufferData* p_thread_buffer) const
    {
-#if 0
+#if 1
        mThreadwiseLoad.template Run<BlockSrcData,
                                     ThreadBufferData,
                                     BlockSrcAddressSpace,
@@ -94,11 +94,11 @@ struct BlockwiseGenericTensorSliceCopy_v4
    __device__ void RunStoreThreadBuffer(const ThreadBufferData* p_thread_buffer,
                                         BlockDstData* p_block_dst) const
    {
-#if 0
+#if 1
        mThreadwiseStore.template Run<ThreadBufferData,
-                                              BlockDstData,
-                                              ThreadBufferAddressSpace,
-                                              BlockDstAddressSpace>(p_thread_buffer, p_block_dst);
+                                      BlockDstData,
+                                      ThreadBufferAddressSpace,
+                                      BlockDstAddressSpace>(p_thread_buffer, p_block_dst);
 #else // tweaking
        mThreadwiseStore.template Run_optimized_dst_address_calculation<ThreadBufferData,
                                                                        BlockDstData,
--- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
+++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp
@@ -226,14 +226,6 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
        constexpr auto src_linear_dim_mask    = SrcDesc::GetLinearDimensionMask();
        constexpr auto src_nonlinear_dim_mask = SrcDesc::GetNonLinearDimensionMask();

-#if 0 // debug
-        if(get_block_1d_id() == 0 && get_thread_local_1d_id() == 0)
-        {
-            print_sequence("src_linear_dim_mask", src_linear_dim_mask);
-            print_sequence("src_nonlinear_dim_mask", src_nonlinear_dim_mask);
-        }
-#endif
-
        static_assert(src_linear_dim_mask.At(VectorAccessDim) ||
                          long_vector_size == SrcDataPerAccess,
                      "Warning! VectorAccessDim is not SrcDesc's linear dimension, performance "
@@ -295,18 +287,14 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                    const auto src_coord =
                        src_nonlinear_coord + (linear_dim_data_steps + scalar_id);

-// this is src compile-time offset
-#if 0
-                    // TODO: is this good implementation?
+#if 1 // tweaking
+                    // this is src compile-time offset
                    const index_t src_linear_offset =
                        src_coord.GetOffset() - src_nonlinear_coord.GetOffset();
-#elif 0
+#else
+                    // this is src compile-time offset
                    const index_t src_linear_offset =
-                        SrcDesc::CalculateOffset(linear_dim_data_steps + scalar_id) -
-                        SrcDesc::CalculateOffset(make_zero_array<index_t, nDim>());
-#elif 1
-                    const index_t src_linear_offset =
-                        src_coord.CalculateOffsetDiff(linear_dim_data_steps + scalar_id);
+                        src_nonlinear_coord.CalculateOffsetDiff(linear_dim_data_steps + scalar_id);
 #endif

                    // Check src vector's padding situation, only check the first data in
@@ -396,14 +384,6 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
        constexpr auto dst_linear_dim_mask    = DstDesc::GetLinearDimensionMask();
        constexpr auto dst_nonlinear_dim_mask = DstDesc::GetNonLinearDimensionMask();

-#if 0 // debug
-        if(get_block_1d_id() == 0 && get_thread_local_1d_id() == 0)
-        {
-            print_sequence("dst_linear_dim_mask", dst_linear_dim_mask);
-            print_sequence("dst_nonlinear_dim_mask", dst_nonlinear_dim_mask);
-        }
-#endif
-
        static_assert(dst_linear_dim_mask.At(VectorAccessDim) ||
                          long_vector_size == DstDataPerAccess,
                      "Warning! VectorAccessDim is not DstDesc's linear dimension, performance "
@@ -496,18 +476,14 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2
                    const auto dst_coord =
                        dst_nonlinear_coord + (linear_dim_data_steps + scalar_id);

-// this is dst compile-time offset
-#if 0
-                    // TODO: is this good implementation?
+#if 1 // tweaking
+                    // this is dst compile-time offset
                    const index_t dst_linear_offset =
                        dst_coord.GetOffset() - dst_nonlinear_coord.GetOffset();
-#elif 0
+#else
+                    // this is dst compile-time offset
                    const index_t dst_linear_offset =
-                        DstDesc::CalculateOffset(linear_dim_data_steps + scalar_id) -
-                        DstDesc::CalculateOffset(make_zero_array<index_t, nDim>());
-#elif 1
-                    const index_t dst_linear_offset =
-                        dst_coord.CalculateOffsetDiff(linear_dim_data_steps + scalar_id);
+                        dst_nonlinear_coord.CalculateOffsetDiff(linear_dim_data_steps + scalar_id);
 #endif

                    // Check dst vector's padding situation, only check the first data in
--- a/composable_kernel/include/utility/config_amd.hpp.in
+++ b/composable_kernel/include/utility/config_amd.hpp.in
@@ -6,7 +6,7 @@

 #define CK_UNSIGNED_INDEX_TYPE 0
 #define CK_DEVICE_BACKEND_AMD 1
-#define CK_USE_AMD_INTRINSIC 0
+#define CK_USE_AMD_INTRINSIC 1
 #define CK_USE_AMD_INLINE_ASM 1
 #define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE 1
 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1
--- a/driver/src/driver.cpp
+++ b/driver/src/driver.cpp
@@ -74,7 +74,7 @@ int main(int argc, char* argv[])
 {
    using namespace ck;

-#if 1
+#if 0
    constexpr index_t N  = 128;
    constexpr index_t C  = 128;
    constexpr index_t HI = 17;