diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp index f0ed466eef..6f92b30617 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded.hpp @@ -185,6 +185,7 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded InBlockCopyDstDataPerWrite_N2>( {0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0}); +#if 0 // weight tensor // tensor descriptor in device memory, src of blockwise copy constexpr auto wei_e_k_global_desc = @@ -192,6 +193,13 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded make_tuple(Merge>{}, PassThrough{}), make_tuple(Sequence<1, 2, 3>{}, Sequence<0>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); +#else // hack + constexpr auto wei_e_k_global_desc_old = + WeiGlobalDesc::Unfold(I1, I3).ReorderGivenNew2Old(Sequence<1, 0>{}); + + constexpr auto wei_e_k_global_desc = make_native_tensor_descriptor( + wei_e_k_global_desc_old.GetLengths(), wei_e_k_global_desc_old.GetStrides()); +#endif // tensor descriptor in LDS, dst of blockwise copy // be careful of LDS alignment diff --git a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp index cce36456d8..011112e49f 100644 --- a/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp +++ b/composable_kernel/include/kernel_algorithm/gridwise_convolution_implicit_gemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buffer.hpp @@ -184,23 +184,27 @@ struct GridwiseConvolutionImplicitGemm_v4r1_nchw_kcyx_nkhw_padded_lds_double_buf InBlockCopyDstDataPerWrite_N2>( {0, 0, b_block_data_on_global, 0}, {0, 0, 0, 0}); +#if 0 // weight tensor // tensor descriptor in device memory, src of blockwise copy constexpr auto wei_e_k_global_desc = -#if 0 transform_tensor_descriptor(wei_k_c_y_x_global_desc, make_tuple(Merge>{}, PassThrough{}), make_tuple(Sequence<1, 2, 3>{}, Sequence<0>{}), make_tuple(Sequence<0>{}, Sequence<1>{})); #else // hack - make_native_tensor_descriptor_packed(Sequence{}); + constexpr auto wei_e_k_global_desc_old = + WeiGlobalDesc::Unfold(I1, I3).ReorderGivenNew2Old(Sequence<1, 0>{}); + + constexpr auto wei_e_k_global_desc = make_native_tensor_descriptor( + wei_e_k_global_desc_old.GetLengths(), wei_e_k_global_desc_old.GetStrides()); #endif - // tensor descriptor in LDS, dst of blockwise copy - // be careful of LDS alignment - constexpr auto wei_e_k_block_desc = make_native_tensor_descriptor_aligned( - Sequence{}, - Number{}); + // tensor descriptor in LDS, dst of blockwise copy + // be careful of LDS alignment + constexpr auto wei_e_k_block_desc = make_native_tensor_descriptor_aligned( + Sequence{}, + Number{}); // operator for blockwise copy of weight into LDS // slice a tensor, and copy it into another tensor diff --git a/composable_kernel/include/tensor_description/tensor_coordinate.hpp b/composable_kernel/include/tensor_description/tensor_coordinate.hpp index 1a2774b589..4600b682ac 100644 --- a/composable_kernel/include/tensor_description/tensor_coordinate.hpp +++ b/composable_kernel/include/tensor_description/tensor_coordinate.hpp @@ -313,14 +313,14 @@ struct TensorCoordinate private: template __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(ConstantTensorDescriptor) + MakeDummyTensorCoordinate(ConstantTensorDescriptor) { return NormalTensorCoordinate>(); } template __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor) + MakeDummyTensorCoordinate(ConstantMergedTensorDescriptor) { return MergedTensorCoordinate>(); } diff --git a/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp b/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp index 62dc8b4c9a..831088ab25 100644 --- a/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp +++ b/composable_kernel/include/tensor_description/tensor_coordinate_v2.hpp @@ -188,7 +188,7 @@ struct TensorCoordinate_v2 private: template __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(NativeTensorDescriptor) + MakeDummyTensorCoordinate(NativeTensorDescriptor) { return NativeTensorCoordinate>( make_zero_array()); @@ -196,7 +196,7 @@ struct TensorCoordinate_v2 template __host__ __device__ static constexpr auto - MakeDummyTensorCoordinate(TransformedTensorDescriptor) + MakeDummyTensorCoordinate(TransformedTensorDescriptor) { return TransformedTensorCoordinate>( make_zero_array()); diff --git a/composable_kernel/include/tensor_description/tensor_descriptor.hpp b/composable_kernel/include/tensor_description/tensor_descriptor.hpp index a41dc1da67..df625c6ecb 100644 --- a/composable_kernel/include/tensor_description/tensor_descriptor.hpp +++ b/composable_kernel/include/tensor_description/tensor_descriptor.hpp @@ -85,12 +85,6 @@ struct NativeTensorDescriptor return offset; } - // TODO: remove this - __host__ __device__ static constexpr index_t GetOffsetFromMultiIndex(const Index& idx) - { - return CalculateOffset(idx); - } - __host__ __device__ static constexpr index_t CalculateOffsetDiff(const Index& idx_diff) { index_t offset_diff = 0; @@ -227,13 +221,6 @@ struct TransformedTensorDescriptor return LowTensorDescriptor{}; } -#if 0 - __host__ __device__ static constexpr auto GetLowerLengths() - { - return GetLowerTensorDescriptor().GetLengths(); - } -#endif - struct lambda_GetUpperLengths { template @@ -359,12 +346,6 @@ struct TransformedTensorDescriptor return GetLowerTensorDescriptor().CalculateOffset(CalculateLowerIndex(idx_up)); } - // TODO: remove this - __host__ __device__ static constexpr index_t GetOffsetFromMultiIndex(const UpperIndex& idx_up) - { - return CalculateOffset(idx_up); - } - #if 0 template __host__ __device__ static constexpr bool IsLinearDimension(Number) diff --git a/driver/src/driver.cpp b/driver/src/driver.cpp index 045e796c38..8ae1b3eb35 100644 --- a/driver/src/driver.cpp +++ b/driver/src/driver.cpp @@ -49,7 +49,7 @@ struct GeneratorTensor_3 { std::array dims = {{static_cast(is)...}}; - auto f_acc = [](auto a, auto b) { return 100 * a + b; }; + auto f_acc = [](auto a, auto b) { return 10 * a + b; }; return std::accumulate(dims.begin(), dims.end(), index_t(0), f_acc); } @@ -75,19 +75,19 @@ int main(int argc, char* argv[]) using namespace ck; #if 0 - constexpr index_t N = 256; - constexpr index_t C = 64; - constexpr index_t HI = 17; - constexpr index_t WI = 17; - constexpr index_t K = 256; - constexpr index_t Y = 17; - constexpr index_t X = 17; + constexpr index_t N = 8; + constexpr index_t C = 8; + constexpr index_t HI = 2; + constexpr index_t WI = 8; + constexpr index_t K = 128; + constexpr index_t Y = 1; + constexpr index_t X = 1; using ConvStrides = Sequence<1, 1>; using ConvDilations = Sequence<1, 1>; - using LeftPads = Sequence<0, 3>; - using RightPads = Sequence<0, 3>; + using LeftPads = Sequence<0, 0>; + using RightPads = Sequence<0, 0>; #elif 0 // 3x3, 34x34 constexpr index_t N = 64; @@ -347,7 +347,7 @@ int main(int argc, char* argv[]) wei_kcyx.GenerateTensorValue(GeneratorTensor_1{}, num_thread); #elif 0 in_nchw.GenerateTensorValue(GeneratorTensor_1{}, num_thread); - wei_kcyx.GenerateTensorValue(GeneratorTensor_2{-5, 5}, num_thread); + wei_kcyx.GenerateTensorValue(GeneratorTensor_3{}, num_thread); #elif 0 in_nchw.GenerateTensorValue(GeneratorTensor_3{}, num_thread); wei_kcyx.GenerateTensorValue(GeneratorTensor_1{}, num_thread);