diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp index 2a934fb9cb..be747eaca3 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp @@ -438,7 +438,14 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf 0, b_thread_data_on_global, 0}) - .template Run_amd_experiment(p_out_thread, p_out_global); +#if 0 + .Run_generic +#elif 1 + .template Run_generic +#elif 1 + .template Run_optimized_dst_address_calculation +#endif + (p_out_thread, p_out_global); } } }; diff --git a/composable_kernel/include/tensor_description/tensor_coordinate.hpp b/composable_kernel/include/tensor_description/tensor_coordinate.hpp index 5b4805e9ee..223d0d5bed 100644 --- a/composable_kernel/include/tensor_description/tensor_coordinate.hpp +++ b/composable_kernel/include/tensor_description/tensor_coordinate.hpp @@ -325,14 +325,14 @@ struct TensorCoordinate private: template __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(ConstantTensorDescriptor) + MakeDummyTensorCoordinate(ConstantTensorDescriptor) { return NormalTensorCoordinate>(); } template __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor) + MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor) { return MergedTensorCoordinate>(); } diff --git a/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp b/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp index 831088ab25..62dc8b4c9a 100644 --- a/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp +++ b/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp @@ -188,7 +188,7 @@ struct TensorCoordinate_v2 private: template __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(NativeTensorDescriptor) + MakeDummyTensorCoordinate(NativeTensorDescriptor) { return NativeTensorCoordinate>( make_zero_array()); @@ -196,7 +196,7 @@ struct TensorCoordinate_v2 template __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(TransformedTensorDescriptor) + MakeDummyTensorCoordinate(TransformedTensorDescriptor) { return TransformedTensorCoordinate>( make_zero_array()); diff --git a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp index 25349dc9f9..e4a9af67be 100644 --- a/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/blockwise_generic_tensor_slice_copy.hpp @@ -742,12 +742,15 @@ struct BlockwiseGenericTensorSliceCopy_v4 __device__ void RunLoadRegisterBuffer(const TData* p_src, TData* p_buffer) const { #if 0 - mThreadwiseLoad.Run(p_src, p_buffer); + mThreadwiseLoad.Run_generic(p_src, p_buffer); #elif 1 - mThreadwiseLoad.Run_access_order_optimized_for_source_index_calculation(p_src, p_buffer); -#elif 0 - // hardcoded: global to register - mThreadwiseLoad.template Run_amd_experiment(p_src, p_buffer); + // hardcoded: src is global memory + mThreadwiseLoad.template Run_generic(p_src, p_buffer); +#elif 1 + // 
hardcoded: src is global memory + mThreadwiseLoad + .template Run_optimized_src_address_calculation( + p_src, p_buffer); #endif } @@ -755,10 +758,15 @@ struct BlockwiseGenericTensorSliceCopy_v4 __device__ void RunStoreRegisterBuffer(const TData* p_buffer, TData* p_dst) const { #if 0 - mThreadwiseStore.Run(p_buffer, p_dst); + mThreadwiseStore.Run_generic(p_buffer, p_dst); #elif 1 - // hardcoded: register to LDS - mThreadwiseStore.template Run_amd_experiment(p_buffer, p_dst); + // hardcoded: dst is lds + mThreadwiseStore.template Run_generic(p_buffer, p_dst); +#elif 1 + // hardcoded: dst is lds + mThreadwiseStore + .template Run_optimized_dst_address_calculation(p_buffer, + p_dst); #endif } diff --git a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp index d8ecf6508b..99148042f2 100644 --- a/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp +++ b/composable_kernel/include/tensor_operation/threadwise_generic_tensor_slice_copy.hpp @@ -21,10 +21,6 @@ #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0 #endif -#ifndef CK_EXPERIMENTAL_USE_AMD_BUFFER_LOAD_STORE_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 -#define CK_EXPERIMENTAL_USE_AMD_BUFFER_LOAD_STORE_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0 -#endif - namespace ck { // This threadwise copy allow vector access of src and dst. @@ -36,11 +32,11 @@ namespace ck { // device memory or LDS. // When copying large amout of data, let's hope compiler will reduce register // used for the buffer. -template + template __device__ void Run(const TData* p_src, TData* p_dst) const { constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SliceLengths{}); @@ -262,10 +258,10 @@ struct ThreadwiseGenericTensorSliceCopy_v1r1 // The dimension access order should be the same on src and dst. // It is designed for cases, where one of src and dst is register, and // the other is device memory or LDS -template @@ -328,7 +324,7 @@ struct ThreadwiseGenericTensorSliceCopy_v1r2 mDstSliceOrigin = dst_slice_origin; } - template + template __device__ void Run(const TData* p_src, TData* p_dst) const { using src_vector_t = typename vector_type::MemoryType; @@ -443,11 +439,11 @@ struct ThreadwiseGenericTensorSliceCopy_v1r2 // device memory or LDS. // When copying large amout of data, let's hope compiler will reduce register // used for the buffer. -template + template struct IsolateMergedDimLengths { - template + template __device__ constexpr index_t operator()(IDim idim) const { return TDesc::ContainMultipleOriginalDimensions(idim) ? Lengths{}[idim] : 1; } }; - template + template __device__ void Run(const TData* p_src, TData* p_dst) const { constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SliceLengths{}); @@ -765,7 +761,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1 // 0: VGPR // 1: LDS // 2: global-memory - template + template __device__ void Run_amd_experiment(const TData* p_src, TData* p_dst) const { constexpr auto buffer_desc = make_ConstantTensorDescriptor_packed(SliceLengths{}); @@ -839,8 +835,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1 // 2. src_normal_offset must be calculatd at compile time (guaranteed) // 3. 
src_merged_offset can be runtime value (no assumption imposed) static_if{}([&](auto) { -#if CK_USE_AMD_INTRINSIC && \ - CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 +#if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE vector_data = __buffer_load( p_src, static_cast(src_merged_offset), @@ -940,8 +935,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1 // 2. dst_normal_offset must be calculatd at compile time (guaranteed) // 3. dst_merged_offset can be runtime value (no assumption imposed) static_if{}([&](auto) { -#if CK_USE_AMD_INTRINSIC && \ - CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 +#if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE __buffer_store( vector_data, p_dst, dst_merged_offset, dst_normal_offset); #else @@ -959,7 +953,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1 } // T can be Sequence or Array - template + template __device__ void MoveSrcSliceWindow(T step_sizes, integral_constant) { static_if{}([&](auto) { @@ -967,7 +961,7 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1 }).Else([&](auto) { mSrcSliceOrigin -= step_sizes; }); } - template + template __device__ void MoveDstSliceWindow(T step_sizes, integral_constant) { static_if{}([&](auto) { @@ -981,11 +975,11 @@ struct ThreadwiseGenericTensorSliceCopy_v2r1 }; // this version use TensorView and TensorCoordinate -template + template __device__ void MoveSrcSliceWindow(T step_sizes, integral_constant) { mSrc.MoveSliceWindow(mSrcSlice, step_sizes, integral_constant{}); } - template + template __device__ void MoveDstSliceWindow(T step_sizes, integral_constant) { mDst.MoveSliceWindow(mDstSlice, step_sizes, integral_constant{}); @@ -1187,8 +1181,12 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 mDstSliceOrigin = dst_slice_origin; } - template - __device__ void Run(const TData* p_src, TData* p_dst) const + // Will do padding check on src data: Read 0 if src data is in padding area. + // Will do padding check on dst data: No write if dst data is in paddin area. + template + __device__ void Run_generic(const TData* p_src, TData* p_dst) const { using src_vector_t = typename vector_type::MemoryType; using dst_vector_t = typename vector_type::MemoryType; @@ -1214,7 +1212,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 // buffer to hold a long-vector TData p_long_vector[long_vector_size]; - // set 0 + // zero out buffer for(index_t i = 0; i < long_vector_size; ++i) { p_long_vector[i] = 0; @@ -1226,18 +1224,29 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 auto scalar_id = make_zero_array(); scalar_id(vector_access_dim) = i * src_data_per_access; + const index_t buffer_offset = i * src_data_per_access; + const auto src_coord = mSrcSliceOrigin + (long_vector_data_begin_id + scalar_id); - // check for padding - // TODO: still kind of messy + // Check src vector's padding situation, only check the first data in this src + // vector. 
It's user's responsibility to make sure all data in the src vector has + // the same padding situation + // TODO: not sure a dedicated IsAnyLevelIndexInPaddingArea() function is necessary if(!src_coord.IsAnyLevelIndexInPaddingArea()) { - const index_t src_offset = src_coord.GetOffset(); - - const index_t buffer_offset = i * src_data_per_access; - - *reinterpret_cast(&p_long_vector[buffer_offset]) = - *reinterpret_cast(&p_src[src_offset]); + static_if{}([&](auto) { +#if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE + *reinterpret_cast(&p_long_vector[buffer_offset]) = + __buffer_load(p_src, src_coord.GetOffset(), 0); +#else + *reinterpret_cast(&p_long_vector[buffer_offset]) = + *reinterpret_cast(&p_src[src_coord.GetOffset()]); +#endif + }).Else([&](auto) { + // src can be all kinds of memory-space. + *reinterpret_cast(&p_long_vector[buffer_offset]) = + *reinterpret_cast(&p_src[src_coord.GetOffset()]); + }); } } @@ -1249,24 +1258,53 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 const index_t buffer_offset = i * dst_data_per_access; - const index_t dst_offset = - (mDstSliceOrigin + (long_vector_data_begin_id + scalar_id)).GetOffset(); + const auto dst_coord = mDstSliceOrigin + (long_vector_data_begin_id + scalar_id); - *reinterpret_cast(&p_dst[dst_offset]) = - *reinterpret_cast(&p_long_vector[buffer_offset]); +// Check dst vector's padding situation, only check the first data in this dst +// vector. It's user's responsibility to make sure all data in the dst vector has +// the same padding situation +// TODO: not sure a dedicated IsAnyLevelIndexInPaddingArea() function is necessary +#if 0 // tuning + if(!dst_coord.IsAnyLevelIndexInPaddingArea()) +#endif + { + static_if{}([&](auto) { +#if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE + __buffer_store( + *reinterpret_cast(&p_long_vector[buffer_offset]), + p_dst, + dst_coord.GetOffset(), + 0); +#else + *reinterpret_cast(&p_dst[dst_coord.GetOffset()]) = + *reinterpret_cast(&p_long_vector[buffer_offset]); +#endif + }).Else([&](auto) { + // dst can be all kinds of memory-space + *reinterpret_cast(&p_dst[dst_coord.GetOffset()]) = + *reinterpret_cast(&p_long_vector[buffer_offset]); + }); + } } }); } + // Modify Length to 1, if Mask is set to false + // Used for isolating linear dimension from non-linear dimensions template __device__ static constexpr auto mask_lengths(Sequence, Sequence) { return Sequence<(Mask ? Lengths : 1)...>{}; } - template - __device__ void Run_access_order_optimized_for_source_index_calculation(const TData* p_src, - TData* p_dst) const + // p_src must be global-memory, p_dst can be any memory-space. + // User should make sure p_src is a block-invariant pointer, because + // buffer_load is used for loading from global-memory into register buffer. + // Will do padding check on src data: Read 0 if src data is in padding area. + // Will do padding check on dst data: No write if dst data is in padding area.
+ // This version is optimized for address calculation of src tensor + template + __device__ void Run_optimized_src_address_calculation(const TData* p_src, TData* p_dst) const { using src_vector_t = typename vector_type::MemoryType; using dst_vector_t = typename vector_type::MemoryType; @@ -1281,11 +1319,16 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 constexpr auto long_vector_access_lengths = SliceLengths::Modify( vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size); - // TODO:: don't use hack + // TODO:: stop using this hack, once TransformedTensorDescriptor::GetLinearDimensionMask() + // is implemented constexpr auto src_linear_dim_mask = SrcLinearDimensionMask{}; constexpr auto src_nonlinear_dim_mask = SrcNonLinearDimensionMask{}; - // separate steps into linear and non-linear components + static_assert( + src_linear_dim_mask.At(VectorAccessDim) || long_vector_size == SrcDataPerAccess, + "Warning! VectorAccessDim is not SrcDesc's linear dimension, performance would drop"); + + // separate steps into linear and non-linear components, accoording to src tensor constexpr auto linear_long_vector_access_lengths = mask_lengths(long_vector_access_lengths, src_linear_dim_mask); @@ -1293,88 +1336,122 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 mask_lengths(long_vector_access_lengths, src_nonlinear_dim_mask); // loop over src's non-linear dimensions - ford{}( - [&](auto nonlinear_dim_long_vector_access_id) { + ford{}([&]( + auto nonlinear_dim_long_vector_access_id) { - // step-sizes along src's nonlinear dimensions - auto nonlinear_dim_data_steps = nonlinear_dim_long_vector_access_id; - nonlinear_dim_data_steps(vector_access_dim) = - long_vector_size * nonlinear_dim_long_vector_access_id[vector_access_dim]; + // calculate step-sizes along src's nonlinear dimensions + auto nonlinear_dim_data_steps = nonlinear_dim_long_vector_access_id; + nonlinear_dim_data_steps(vector_access_dim) = + long_vector_size * nonlinear_dim_long_vector_access_id[vector_access_dim]; - // move src cooridnate along nonlinear dimensions - const auto src_nonlinear_coord = mSrcSliceOrigin + nonlinear_dim_data_steps; + // move src cooridnate along nonlinear dimensions + // this coordinate contains run-time per-thread offset + const auto src_nonlinear_coord = mSrcSliceOrigin + nonlinear_dim_data_steps; - // loop over src's linear dimensions - ford{}( - [&](auto linear_dim_long_vector_access_id) { + // loop over src's linear dimensions + ford{}([&]( + auto linear_dim_long_vector_access_id) { - // step-sizes along src's linear dimensions - auto linear_dim_data_steps = linear_dim_long_vector_access_id; - linear_dim_data_steps(vector_access_dim) = - long_vector_size * linear_dim_long_vector_access_id[vector_access_dim]; + // step-sizes along src's linear dimensions + auto linear_dim_data_steps = linear_dim_long_vector_access_id; + linear_dim_data_steps(vector_access_dim) = + long_vector_size * linear_dim_long_vector_access_id[vector_access_dim]; - // buffer to hold a long-vector - TData p_long_vector[long_vector_size]; + // buffer to hold a long-vector + TData p_long_vector[long_vector_size]; - // set 0 - for(index_t i = 0; i < long_vector_size; ++i) - { - p_long_vector[i] = 0; - } + // zero out buffer + for(index_t i = 0; i < long_vector_size; ++i) + { + p_long_vector[i] = 0; + } - // load data from src to the long-vector buffer - for(index_t i = 0; i < long_vector_size / src_data_per_access; ++i) - { - auto scalar_id = make_zero_array(); - scalar_id(vector_access_dim) = i * src_data_per_access; + // 
Loop over VectorAccessDim, and load data from src to the + // long-vector buffer. + // If VectorAccessDim is src's linear dimension, then src's + // offset-diff due to this looping is known at compile-time. If + // VectorAccessDim is src's nonlinear dimension, then src's + // offset-diff due to this looping is only known at run-time. For best + // performance, VectorAccessDim should be src's linear dimension + for(index_t i = 0; i < long_vector_size / src_data_per_access; ++i) + { + auto scalar_id = make_zero_array(); + scalar_id(vector_access_dim) = i * src_data_per_access; - // move src cooridnate along linear dimensions - const auto src_coord = - src_nonlinear_coord + (linear_dim_data_steps + scalar_id); + const index_t buffer_offset = i * src_data_per_access; - // TODO: good implementation? - const index_t src_linear_offset_diff = - src_coord.GetOffset() - src_nonlinear_coord.GetOffset(); + // move src coordinate along linear dimensions + const auto src_coord = + src_nonlinear_coord + (linear_dim_data_steps + scalar_id); - // check for padding - // TODO: still kind of messy - if(!src_coord.IsAnyLevelIndexInPaddingArea()) - { - const index_t src_offset = src_coord.GetOffset(); + // this is src compile-time offset + // TODO: is this good implementation? + const index_t src_linear_offset = + src_coord.GetOffset() - src_nonlinear_coord.GetOffset(); - const index_t buffer_offset = i * src_data_per_access; + // Check src vector's padding situation, only check the first data in + // this src vector. It's user's responsibility to make sure all data in + // the src vector has the same padding situation + // TODO: not sure a dedicated IsAnyLevelIndexInPaddingArea() function is + // necessary + if(!src_coord.IsAnyLevelIndexInPaddingArea()) + { + static_if{}([&](auto) { +#if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE + *reinterpret_cast(&p_long_vector[buffer_offset]) = + __buffer_load( + p_src, src_nonlinear_coord.GetOffset(), src_linear_offset); +#else + *reinterpret_cast(&p_long_vector[buffer_offset]) = + *reinterpret_cast( + &p_src[src_nonlinear_coord.GetOffset() + src_linear_offset]); +#endif + }).Else([&](auto) { + *reinterpret_cast(&p_long_vector[buffer_offset]) = + *reinterpret_cast( + &p_src[src_nonlinear_coord.GetOffset() + src_linear_offset]); + }); + } + } - *reinterpret_cast(&p_long_vector[buffer_offset]) = - *reinterpret_cast(&p_src[src_offset]); - } - } + // store data from the long-vector buffer to dst + for(index_t i = 0; i < long_vector_size / dst_data_per_access; ++i) + { + auto scalar_id = make_zero_array(); + scalar_id(vector_access_dim) = i * dst_data_per_access; - // store data from the long-vector buffer to dst - for(index_t i = 0; i < long_vector_size / dst_data_per_access; ++i) - { - auto scalar_id = make_zero_array(); - scalar_id(vector_access_dim) = i * dst_data_per_access; + const index_t buffer_offset = i * dst_data_per_access; - const index_t buffer_offset = i * dst_data_per_access; + // dst offset is calculated here, without explicitly separating into + // compile-time and per-thread component + const auto dst_coord = mDstSliceOrigin + (nonlinear_dim_data_steps + + linear_dim_data_steps + scalar_id); - const index_t dst_offset = - (mDstSliceOrigin + - (nonlinear_dim_data_steps + linear_dim_data_steps + scalar_id)) - .GetOffset(); - - *reinterpret_cast(&p_dst[dst_offset]) = - *reinterpret_cast(&p_long_vector[buffer_offset]); - } - }); +// Check dst vector's padding situation, only check the first data in +// this dst vector.
It's user's responsibility to make sure all data in +// the dst vector has the same padding situation +// TODO: not sure a dedicated IsAnyLevelIndexInPaddingArea() function is +// necessary +#if 0 // tuning + if(!dst_coord.IsAnyLevelIndexInPaddingArea()) +#endif + { + *reinterpret_cast(&p_dst[dst_coord.GetOffset()]) = + *reinterpret_cast(&p_long_vector[buffer_offset]); + } + } }); + }); } - // memory-space - // 0: VGPR - // 1: LDS - // 2: global-memory - template - __device__ void Run_amd_experiment(const TData* p_src, TData* p_dst) const + // p_src could be any memory space, p_dst must be global memory. + // User should make sure p_dst is a block-invariant pointer, because + // buffer_store is used for storing data from register buffer into global-memory. + // Will do padding check on src data: Read 0 if src data is in padding area. + // Will do padding check on dst data: No write if dst data is in padding area. + // This version is optimized for address calculation of dst tensor + template + __device__ void Run_optimized_dst_address_calculation(const TData* p_src, TData* p_dst) const { using src_vector_t = typename vector_type::MemoryType; using dst_vector_t = typename vector_type::MemoryType; @@ -1389,90 +1466,134 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 constexpr auto long_vector_access_lengths = SliceLengths::Modify( vector_access_dim, SliceLengths::Get(vector_access_dim) / long_vector_size); - ford{}([&]( - auto long_vector_access_id) { + // TODO:: stop using this hack, once TransformedTensorDescriptor::GetLinearDimensionMask() + // is implemented + constexpr auto dst_linear_dim_mask = DstLinearDimensionMask{}; + constexpr auto dst_nonlinear_dim_mask = DstNonLinearDimensionMask{}; - // data id w.r.t slicing-window - auto long_vector_data_begin_id = long_vector_access_id; - long_vector_data_begin_id(vector_access_dim) = - long_vector_size * long_vector_access_id[vector_access_dim]; + static_assert( + dst_linear_dim_mask.At(VectorAccessDim) || long_vector_size == DstDataPerAccess, + "Warning!
VectorAccessDim is not DstDesc's linear dimension, performance would drop"); - // buffer to hold a long-vector - TData p_long_vector[long_vector_size]; + // separate steps into linear and non-linear components, accoording to dst tensor + constexpr auto linear_long_vector_access_lengths = + mask_lengths(long_vector_access_lengths, dst_linear_dim_mask); - // set 0 - for(index_t i = 0; i < long_vector_size; ++i) - { - p_long_vector[i] = 0; - } + constexpr auto nonlinear_long_vector_access_lengths = + mask_lengths(long_vector_access_lengths, dst_nonlinear_dim_mask); - // load data from src to the long-vector buffer - for(index_t i = 0; i < long_vector_size / src_data_per_access; ++i) - { - auto scalar_id = make_zero_array(); - scalar_id(vector_access_dim) = i * src_data_per_access; + // loop over dst's non-linear dimensions + ford{}([&]( + auto nonlinear_dim_long_vector_access_id) { - const auto src_coord = mSrcSliceOrigin + (long_vector_data_begin_id + scalar_id); + // calculate step-sizes along dst's nonlinear dimensions + auto nonlinear_dim_data_steps = nonlinear_dim_long_vector_access_id; + nonlinear_dim_data_steps(vector_access_dim) = + long_vector_size * nonlinear_dim_long_vector_access_id[vector_access_dim]; - // check for padding - // TODO: still kind of messy - if(!src_coord.IsAnyLevelIndexInPaddingArea()) + // move dst cooridnate along nonlinear dimensions + // this coordinate contains run-time per-thread offset + const auto dst_nonlinear_coord = mDstSliceOrigin + nonlinear_dim_data_steps; + + // loop over dst's linear dimensions + ford{}([&]( + auto linear_dim_long_vector_access_id) { + + // step-sizes along dst's linear dimensions + auto linear_dim_data_steps = linear_dim_long_vector_access_id; + linear_dim_data_steps(vector_access_dim) = + long_vector_size * linear_dim_long_vector_access_id[vector_access_dim]; + + // buffer to hold a long-vector + TData p_long_vector[long_vector_size]; + + // zero out buffer + for(index_t i = 0; i < long_vector_size; ++i) { - const index_t src_offset = src_coord.GetOffset(); + p_long_vector[i] = 0; + } + + // Loop over VectorAccessDim, and load data from src to the + // long-vector buffer. + // If VectorAccessDim is dst's linear dimension, then dst's + // offset-diff due to this looping is known at compile-time. If + // VectorAccessDim is dst's nonlinear dimension, then dst's + // offset-diff due to this looping is only known at run-time. For best + // performance, VectorAccessDim, should be dst's linear dimension + for(index_t i = 0; i < long_vector_size / src_data_per_access; ++i) + { + auto scalar_id = make_zero_array(); + scalar_id(vector_access_dim) = i * src_data_per_access; const index_t buffer_offset = i * src_data_per_access; - static_if{}([&](auto) { -#if CK_USE_AMD_INTRINSIC && \ - CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 + // src offset is calculated here, without explicitly separating into + // compile-time and per-thread component + const auto src_coord = mSrcSliceOrigin + (nonlinear_dim_data_steps + + linear_dim_data_steps + scalar_id); + + // Check src vector's padding situation, only check the first data in + // this src vector. 
It's user's responsibility to make sure all data in + // the src vector has the same padding situation + // TODO: not sure a dedicated IsAnyLevelIndexInPaddingArea() function is + // necessary + if(!src_coord.IsAnyLevelIndexInPaddingArea()) + { *reinterpret_cast(&p_long_vector[buffer_offset]) = - __buffer_load( - p_src, static_cast(src_offset), static_cast(0)); -#else - *reinterpret_cast(&p_long_vector[buffer_offset]) = - *reinterpret_cast(&p_src[src_offset]); -#endif - }).Else([&](auto) { - // src can be all kinds of memory-space. - *reinterpret_cast(&p_long_vector[buffer_offset]) = - *reinterpret_cast(&p_src[src_offset]); - }); + *reinterpret_cast(&p_src[src_coord.GetOffset()]); + } } - // store data from the long-vector buffer to dst - for(index_t i = 0; i < long_vector_size / dst_data_per_access; ++i) - { - auto scalar_id = make_zero_array(); - scalar_id(vector_access_dim) = i * dst_data_per_access; + // store data from the long-vector buffer to dst + for(index_t i = 0; i < long_vector_size / dst_data_per_access; ++i) + { + auto scalar_id = make_zero_array(); + scalar_id(vector_access_dim) = i * dst_data_per_access; - const index_t buffer_offset = i * dst_data_per_access; + const index_t buffer_offset = i * dst_data_per_access; - const index_t dst_offset = - (mDstSliceOrigin + (long_vector_data_begin_id + scalar_id)).GetOffset(); + // move dst coordinate along linear dimensions + const auto dst_coord = + dst_nonlinear_coord + (linear_dim_data_steps + scalar_id); - static_if{}([&](auto) { -#if CK_USE_AMD_INTRINSIC && \ - CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 - __buffer_store( - *reinterpret_cast(&p_long_vector[buffer_offset]), - p_dst, - dst_offset, - 0); -#else - *reinterpret_cast(&p_dst[dst_offset]) = - *reinterpret_cast(&p_long_vector[buffer_offset]); + // this is dst compile-time offset + // TODO: is this good implementation? + const index_t dst_linear_offset = + dst_coord.GetOffset() - dst_nonlinear_coord.GetOffset(); + +// Check dst vector's padding situation, only check the first data in +// this dst vector.
It's user's responsibility to make sure all data in +// the dst vector has the same padding situation +// TODO: not sure a dedicated IsAnyLevelIndexInPaddingArea() function is +// necessary +#if 0 // tuning + if(!dst_coord.IsAnyLevelIndexInPaddingArea()) #endif - }).Else([&](auto) { - // dst can be all kinds of memory-space - *reinterpret_cast(&p_dst[dst_offset]) = - *reinterpret_cast(&p_long_vector[buffer_offset]); - }); - } + { + static_if{}([&](auto) { +#if CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE + __buffer_store( + *reinterpret_cast(&p_long_vector[buffer_offset]), + p_dst, + dst_nonlinear_coord.GetOffset(), + dst_linear_offset); +#else + *reinterpret_cast( + &p_dst[dst_nonlinear_coord.GetOffset() + dst_linear_offset]) = + *reinterpret_cast(&p_long_vector[buffer_offset]); +#endif + }).Else([&](auto) { + *reinterpret_cast( + &p_dst[dst_nonlinear_coord.GetOffset() + dst_linear_offset]) = + *reinterpret_cast(&p_long_vector[buffer_offset]); + }); + } + } + }); }); } - template + template __device__ void MoveSrcSliceWindow(const T& step_sizes_, integral_constant) { @@ -1483,7 +1604,7 @@ struct ThreadwiseGenericTensorSliceCopy_v4r2 }).Else([&](auto) { mSrcSliceOrigin -= step_sizes; }); } - template + template __device__ void MoveDstSliceWindow(const T& step_sizes_, integral_constant) { diff --git a/composable_kernel/include/utility/common_header.hpp b/composable_kernel/include/utility/common_header.hpp index f8b52bba5b..ad6b26735a 100644 --- a/composable_kernel/include/utility/common_header.hpp +++ b/composable_kernel/include/utility/common_header.hpp @@ -22,7 +22,7 @@ #include "amd_inline_asm.hpp" #endif -#if CK_USE_AMD_INTRINCIS +#if CK_USE_AMD_INTRINSIC #include "amd_intrinsic.hpp" #endif diff --git a/composable_kernel/include/utility/config_amd.hpp.in b/composable_kernel/include/utility/config_amd.hpp.in index 0ee722f22d..dffd6fd08b 100644 --- a/composable_kernel/include/utility/config_amd.hpp.in +++ b/composable_kernel/include/utility/config_amd.hpp.in @@ -8,7 +8,7 @@ #define CK_DEVICE_BACKEND_AMD 1 #define CK_USE_AMD_INTRINSIC 1 #define CK_USE_AMD_INLINE_ASM 1 -#define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 1 +#define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE 1 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 1 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0 @@ -16,6 +16,14 @@ namespace ck { +enum address_space_t +{ + generic = 0, + vgpr = 1, + lds = 2, + global = 3 +}; + #if CK_UNSIGNED_INDEX_TYPE using index_t = uint32_t; #else diff --git a/composable_kernel/include/utility/config_nvidia.hpp.in b/composable_kernel/include/utility/config_nvidia.hpp.in index 523b7be589..daab02d140 100644 --- a/composable_kernel/include/utility/config_nvidia.hpp.in +++ b/composable_kernel/include/utility/config_nvidia.hpp.in @@ -10,7 +10,7 @@ #define CK_DEVICE_BACKEND_NVIDIA 1 #define CK_USE_AMD_INTRINSIC 0 #define CK_USE_AMD_INLINE_ASM 0 -#define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V2R1 0 +#define CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE 0 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_BLOCKWISE_GENERIC_SLICE_COPY_V1 0 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R1 0 #define CK_EXPERIMENTAL_USE_MORE_COMPILE_STATIC_THREADWISE_GENERIC_TENSOR_SLICE_COPY_V1R2 0 @@ -18,6 +18,11 @@ namespace ck { +enum
address_space_t +{ + generic = 0 +}; + #if CK_UNSIGNED_INDEX_TYPE using index_t = uint32_t; #else
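
Note: the sketch below is not part of the patch. It is a minimal, self-contained C++17 illustration, under assumptions, of how the new address_space_t enum is meant to drive compile-time selection of the copy path. The patch itself uses ck::static_if and the __buffer_load/__buffer_store intrinsics guarded by CK_USE_AMD_INTRINSIC && CK_USE_AMD_INTRINSIC_BUFFER_LOAD_STORE; the load<> helper, the if constexpr dispatch, and the plain array reads below are stand-ins, not CK APIs.

#include <iostream>

// Illustrative only: stand-in for the enum the patch adds to config_amd.hpp.in
// (the NVIDIA config defines only `generic`).
enum address_space_t
{
    generic = 0,
    vgpr    = 1,
    lds     = 2,
    global  = 3
};

// Hypothetical helper (name and signature are not from CK): compile-time dispatch
// on the address space, analogous to the static_if guard around __buffer_load in
// Run_generic. The `global` branch is where a buffer intrinsic would sit on the
// AMD backend; here it is an ordinary load so the sketch compiles anywhere.
template <address_space_t SrcAddressSpace>
float load(const float* p_src, int offset)
{
    if constexpr(SrcAddressSpace == address_space_t::global)
    {
        return p_src[offset]; // would be a buffer_load on the AMD fast path
    }
    else
    {
        return p_src[offset]; // generic / vgpr / lds: plain addressing
    }
}

int main()
{
    float buf[4] = {0.0f, 1.0f, 2.0f, 3.0f};
    std::cout << load<address_space_t::global>(buf, 2) << '\n'; // prints 2
    std::cout << load<address_space_t::lds>(buf, 1) << '\n';    // prints 1
    return 0;
}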