diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp index d9d948415f..6cedb7f02a 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v1r3_chwn_cyxk_khwn_lds_double_buffer.hpp @@ -7,6 +7,10 @@ #include "blockwise_generic_tensor_slice_copy.hpp" #include "threadwise_generic_tensor_slice_copy.hpp" #include "blockwise_batched_gemm.hpp" +#include "blockwise_2d_tensor_op.hpp" +#include "blockwise_4d_tensor_op.hpp" +#include "threadwise_tensor_slice_copy.hpp" +#include "threadwise_4d_tensor_op.hpp" namespace ck { @@ -129,10 +133,20 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer constexpr auto out_k_h_w_n_thread_desc = make_ConstantTensorDescriptor_packed( Sequence{}); +#if 1 // blockwise copy // input: format is [C, Hi, Wi, N] + const auto blockwise_in_copy = + Blockwise4dTensorCopy3{}; +#else auto blockwise_in_copy = - BlockwiseGenericTensorSliceCopy_v2({0, 0, 0, 0}, {0, 0, 0, 0}); +#endif +#if 1 // blockwise wei copy // format is [CPerBlock, X * KPerBlock] const auto blockwise_wei_copy = - BlockwiseGenericTensorSliceCopy_v2({0, 0}, {0, 0}); +#else + const auto blockwise_wei_copy = + BlockwiseGenericTensorSliceCopy_v1({0, 0}, {0, 0}); +#endif // a series of blockwise batched GEMM // C_matrix += transpose(A_matrix) * B_matrix @@ -402,7 +427,15 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer wo_block_data_begin + wo_thread_data_begin, n_block_data_begin + n_thread_data_begin); - ThreadwiseGenericTensorSliceCopy_v2r1{}); +#else + ThreadwiseGenericTensorSliceCopy_v1r1::type, @@ -413,6 +446,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer OutThreadCopyDataPerAccess_N>( make_zero_array(), make_zero_array()) .Run(p_out_thread, p_out_thread_on_global); +#endif }).Else([&](auto fwd) { static_assert(fwd(GemmNPerThreadSubC) >= NPerBlock && NPerThread == NPerBlock && GemmNPerThreadSubC % NPerThread == 0, @@ -460,7 +494,15 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer wo_block_data_begin + wo_thread_data_begin, n_block_data_begin + n_thread_data_begin); - ThreadwiseGenericTensorSliceCopy_v2r1{}); +#else + ThreadwiseGenericTensorSliceCopy_v1r1::type, @@ -471,6 +513,7 @@ struct GridwiseConvolutionImplicitGemm_v1r3_chwn_cyxk_khwn_lds_double_buffer OutThreadCopyDataPerAccess_N>( make_zero_array(), make_zero_array()) .Run(p_out_thread, p_out_thread_on_global); +#endif }); } }; diff --git a/composable_kernel/include/tensor_description/tensor_coordinate.hpp b/composable_kernel/include/tensor_description/tensor_coordinate.hpp index 77ed7c052b..8fa701ccee 100644 --- a/composable_kernel/include/tensor_description/tensor_coordinate.hpp +++ b/composable_kernel/include/tensor_description/tensor_coordinate.hpp @@ -301,14 +301,14 @@ struct TensorCoordinate private: template __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(ConstantTensorDescriptor) + MakeDummyTensorCoordinate(ConstantTensorDescriptor) { return NormalTensorCoordinate>(); } template __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor) + MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor) { return MergedTensorCoordinate>(); } diff --git a/driver/include/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp b/driver/include/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp index 11908b0816..8fb5c79e07 100644 --- a/driver/include/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp +++ b/driver/include/device_convolution_implicit_gemm_v1_chwn_cyxk_khwn.hpp @@ -143,7 +143,7 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc, constexpr index_t WeiBlockCopyDataPerAccess_K = 4; constexpr index_t OutThreadCopyDataPerAccess_N = 2; -#elif 1 +#elif 0 // for 3x3, 34x34, v1r3, Pascal // for 3x3, 28x28, v1r3, Pascal // for 3x3, 14x14, v1r3, Pascal @@ -266,9 +266,12 @@ void device_convolution_implicit_gemm_v1_chwn_cyxk_khwn(InDesc, constexpr index_t GemmDataPerReadA = 4; constexpr index_t GemmDataPerReadB = 4; + using InBlockCopySubLengths_CHWN = Sequence<1, 1, 1, 4>; using InBlockCopyClusterLengths_CHWN = Sequence<8, 2, 4, 4>; constexpr index_t InBlockCopyDataPerAccess_N = 4; + using WeiBlockCopySubLengths_CK = Sequence<1, 4>; + using WeiBlockCopyClusterLengths_CK = Sequence<8, 32>; constexpr index_t WeiBlockCopyDataPerAccess_K = 4; constexpr index_t OutThreadCopyDataPerAccess_N = 4; diff --git a/driver/src/driver.cpp b/driver/src/driver.cpp index 7ea05e243e..40d6c5b6fe 100644 --- a/driver/src/driver.cpp +++ b/driver/src/driver.cpp @@ -71,7 +71,7 @@ int main(int argc, char* argv[]) { using namespace ck; -#if 1 +#if 0 constexpr index_t N = 64; constexpr index_t C = 1536; constexpr index_t HI = 8;