diff --git a/driver/device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp b/driver/device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp index 2b8cb3b7dc..618d330534 100644 --- a/driver/device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp +++ b/driver/device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw.hpp @@ -75,14 +75,14 @@ void device_convolution_implicit_gemm_v4_nchw_kcyx_nkhw(InDesc, constexpr index_t InBlockCopySrcDataPerRead_B = 1; constexpr index_t InBlockCopyDstDataPerWrite_N2 = 4; - using WeiBlockCopySubLengths_E_K = Sequence<1, 4>; - using WeiBlockCopyClusterLengths_E_K = Sequence<8, 32>; - using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [E, K] - using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [E, K] - using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [K, E] + using WeiBlockCopySubLengths_E_K = Sequence<4, 1>; + using WeiBlockCopyClusterLengths_E_K = Sequence<2, 128>; + using WeiBlockCopyThreadClusterArrangeOrder = Sequence<1, 0>; // [K, E] + using WeiBlockCopySrcAccessOrder = Sequence<1, 0>; // [K, E] + using WeiBlockCopyDstAccessOrder = Sequence<0, 1>; // [E, K] - constexpr index_t WeiBlockCopySrcDataPerRead_E = 1; - constexpr index_t WeiBlockCopyDstDataPerWrite_K = 4; + constexpr index_t WeiBlockCopySrcDataPerRead_E = 4; + constexpr index_t WeiBlockCopyDstDataPerWrite_K = 1; #endif constexpr index_t GridSize =