diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp index 379ca76970..3f5c3dd1bd 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp @@ -184,6 +184,12 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw Sequence{}, Number{}); + // this check is ad-hoc + // TODO: need to properly implement tensor descriptor with multiple alignment + // requirements + static_assert(wei_e_k_block_desc.GetStride(I0) % GemmDataPerReadA == 0, + "GemmDataPerReadA alignment requirement is not satisfied"); + // operator for blockwise copy of weight into LDS // slice a tensor, and copy it into another tensor // this copy operator already have blockwise offset built-in diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp index a124a67e2e..5d2ff88477 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp @@ -207,6 +207,12 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded Sequence{}, Number{}); + // this check is ad-hoc + // TODO: need to properly implement tensor descriptor with multiple alignment + // requirements + static_assert(wei_e_k_block_desc.GetStride(I0) % GemmDataPerReadA == 0, + "GemmDataPerReadA alignment requirement is not satisfied"); + // operator for blockwise copy of weight into LDS // slice a tensor, and copy it into another tensor // this copy operator already have blockwise offset built-in diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp index 630f852e6d..9666e3532d 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp @@ -207,6 +207,12 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf Sequence{}, Number{}); + // this check is ad-hoc + // TODO: need to properly implement tensor descriptor with multiple alignment + // requirements + static_assert(wei_e_k_block_desc.GetStride(I0) % GemmDataPerReadA == 0, + "GemmDataPerReadA alignment requirement is not satisfied"); + // operator for blockwise copy of weight into LDS // slice a tensor, and copy it into another tensor // this copy operator already have blockwise offset built-in diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp index a3ecf64941..e2ab620a83 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw.hpp @@ -159,6 +159,12 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw Sequence{}, Number{}); + // this check is ad-hoc + // TODO: need to properly implement tensor descriptor with multiple alignment + // requirements + static_assert(wei_e_k_block_desc.GetStride(I0) % GemmDataPerReadA == 0, + "GemmDataPerReadA alignment requirement is not satisfied"); + // operator for blockwise copy of weight into LDS // slice a tensor, and copy it into another tensor // this copy operator already have blockwise offset built-in diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp index 1bc814c009..e3c31ed36a 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer.hpp @@ -159,6 +159,12 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_lds_double_buffer Sequence{}, Number{}); + // this check is ad-hoc + // TODO: need to properly implement tensor descriptor with multiple alignment + // requirements + static_assert(wei_e_k_block_desc.GetStride(I0) % GemmDataPerReadA == 0, + "GemmDataPerReadA alignment requirement is not satisfied"); + // operator for blockwise copy of weight into LDS // slice a tensor, and copy it into another tensor // this copy operator already have blockwise offset built-in diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp index fbe387e788..98f942526c 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded.hpp @@ -172,6 +172,12 @@ struct GridwiseConvolutionImplicitGemm_v4r4_nchw_kcyx_nkhw_padded Sequence{}, Number{}); + // this check is ad-hoc + // TODO: need to properly implement tensor descriptor with multiple alignment + // requirements + static_assert(wei_e_k_block_desc.GetStride(I0) % GemmDataPerReadA == 0, + "GemmDataPerReadA alignment requirement is not satisfied"); + // weight blockwise copy auto blockwise_wei_copy = BlockwiseGenericTensorSliceCopy_v4{}, Number{}); + // this check is ad-hoc + // TODO: need to properly implement tensor descriptor with multiple alignment + // requirements + static_assert(wei_e_k_block_desc.GetStride(I0) % GemmDataPerReadA == 0, + "GemmDataPerReadA alignment requirement is not satisfied"); + // weight blockwise copy auto blockwise_wei_copy = BlockwiseGenericTensorSliceCopy_v4::type{}); } -template -__host__ __device__ constexpr auto -vectorize_tensor_descriptor(LowerTensorDescriptor, Number vector_dim, Number) -{ - constexpr index_t nDim = LowerTensorDescriptor::GetNumOfDimension(); - - return transform_tensor_descriptor( - LowerTensorDescriptor{}, - Vectorize{}, - typename arithmetic_sequence_gen<0, nDim, 1>::type{}, - typename arithmetic_sequence_gen<0, nDim, 1>::type{}); -} - template __host__ __device__ void print_tensor_descriptor(const char* s, const NativeTensorDescriptor& desc) diff --git a/driver/src/driver.cpp b/driver/src/driver.cpp index 4890b424c0..a5e03506b5 100644 --- a/driver/src/driver.cpp +++ b/driver/src/driver.cpp @@ -14,7 +14,7 @@ //#include "device_convolution_implicit_gemm_v1_nchw_cyxk_nkhw.hpp" //#include "device_convolution_implicit_gemm_v2_chwn_cyxk_khwn.hpp" //#include "device_convolution_implicit_gemm_v3_nchw_cyxk_nkhw.hpp" -//#include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp" +#include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw.hpp" #include "device_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp" //#include "device_convolution_implicit_gemm_v4r2_nchw_kcyx_nkhw.hpp" //#include "device_convolution_implicit_gemm_v4r3_nchw_kcyx_nkhw.hpp" @@ -295,7 +295,7 @@ int main(int argc, char* argv[]) using LeftPads = Sequence<0, 0>; using RightPads = Sequence<0, 0>; -#elif 0 +#elif 1 // 3x3 filter, 2x2 stride, 35x35 input, 17x17 output // cudnn@V100 90%, ck@V100 93%, ck@P100 83%, ck@VII 81% constexpr index_t N = 128; @@ -341,7 +341,7 @@ int main(int argc, char* argv[]) using LeftPads = Sequence<3, 0>; using RightPads = Sequence<3, 0>; -#elif 1 +#elif 0 // 1x7 filter, 0x3 pad, 17x17 input constexpr index_t N = 128; constexpr index_t C = 128; @@ -486,7 +486,7 @@ int main(int argc, char* argv[]) ConvStrides{}, ConvDilations{}, nrepeat); -#elif 1 +#elif 0 device_convolution_implicit_gemm_v4r4_nchw_kcyx_nkhw_padded(in_nchw_desc, in_nchw, wei_kcyx_desc,