From e6e66fb1f9ba62491f0c397abd69c2dfff61fc8a Mon Sep 17 00:00:00 2001 From: Dipal M Zambare Date: Wed, 27 Nov 2019 19:10:16 +0530 Subject: [PATCH] Fixed reentrancy issues with bli_sgemm_small() and bli_dgemm_small(). Replaced global buffer used for packing with the buffer provided by memory pools. These buffers are checkout at the beginning of each call and return the pool once done. Please check comment in the above functions for details. Change-Id: I76b3560f7efcc621a4455e834fce06f629c38f50 --- kernels/zen/3/bli_gemm_small.c | 109 +++++++++++++++++++++++++++++++-- 1 file changed, 104 insertions(+), 5 deletions(-) diff --git a/kernels/zen/3/bli_gemm_small.c b/kernels/zen/3/bli_gemm_small.c index 1db3b59ea..85d097247 100644 --- a/kernels/zen/3/bli_gemm_small.c +++ b/kernels/zen/3/bli_gemm_small.c @@ -44,12 +44,10 @@ #define BLIS_ENABLE_PREFETCH #define F_SCRATCH_DIM (BLIS_SMALL_MATRIX_THRES * BLIS_SMALL_MATRIX_THRES) -static float A_pack[F_SCRATCH_DIM] __attribute__((aligned(64))); #define D_BLIS_SMALL_MATRIX_THRES (BLIS_SMALL_MATRIX_THRES / 2 ) #define D_BLIS_SMALL_M_RECT_MATRIX_THRES (BLIS_SMALL_M_RECT_MATRIX_THRES / 2) #define D_BLIS_SMALL_K_RECT_MATRIX_THRES (BLIS_SMALL_K_RECT_MATRIX_THRES / 2) #define D_SCRATCH_DIM (D_BLIS_SMALL_MATRIX_THRES * D_BLIS_SMALL_MATRIX_THRES) -static double D_A_pack[D_SCRATCH_DIM] __attribute__((aligned(64))); #define BLIS_ATBN_M_THRES 40 // Threshold value of M for/below which small matrix code is called. #define AT_MR 4 // The kernel dimension of the A transpose GEMM kernel.(AT_MR * NR). static err_t bli_sgemm_small @@ -210,7 +208,10 @@ static err_t bli_sgemm_small alpha_cast = (alpha->buffer); beta_cast = (beta->buffer); gint_t required_packing_A = 1; - + mem_t local_mem_buf_A_s; + float *A_pack = 0; + rntm_t rntm; + // when N is equal to 1 call GEMV instead of GEMM if (N == 1) { @@ -236,6 +237,41 @@ static err_t bli_sgemm_small { required_packing_A = 0; } + + /* + * This function was using global array to pack part of A input when needed. + * However, using this global array make the function non-reentrant. + * Instead of using a global array we should allocate buffer for each invocation. + * Since the buffer size is too big or stack and doing malloc every time will be too expensive, + * better approach is to get the buffer from the pre-allocated pool and return + * it the pool once we are doing. + * + * In order to get the buffer from pool, we need access to memory broker, + * currently this function is not invoked in such a way that it can receive + * the memory broker (via rntm). Following hack will get the global memory + * broker that can be use it to access the pool. + * + * Note there will be memory allocation at least on first innovation + * as there will not be any pool created for this size. + * Subsequent invocations will just reuse the buffer from the pool. + */ +#ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_sgemm_small: acquiring mem pool block of size %lu\n", (F_SCRATCH_DIM * sizeof(float))); +#endif + bli_thread_init_rntm( &rntm ); + bli_rntm_set_num_threads_only( 1, &rntm ); + bli_membrk_rntm_set_membrk( &rntm ); + + // Get the buffer from the pool, if there is no pool with + // required size, it will be created. + bli_membrk_acquire_m(&rntm, + (F_SCRATCH_DIM * sizeof(float)), + BLIS_BITVAL_BUFFER_FOR_A_BLOCK, + &local_mem_buf_A_s + ); + + A_pack = bli_mem_buffer(&local_mem_buf_A_s); + /* * The computation loop runs for MRxN columns of C matrix, thus * accessing the MRxK A matrix data and KxNR B matrix data. @@ -1555,6 +1591,18 @@ static err_t bli_sgemm_small } } } + + // Return the buffer to pool + if (bli_mem_is_alloc( &local_mem_buf_A_s) ) { + +#ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_sgemm_small(): releasing mem pool block\n" ); +#endif + bli_membrk_release( &rntm, + &local_mem_buf_A_s + ); + } + return BLIS_SUCCESS; } else @@ -1617,7 +1665,10 @@ static err_t bli_dgemm_small alpha_cast = (alpha->buffer); beta_cast = (beta->buffer); gint_t required_packing_A = 1; - + mem_t local_mem_buf_A_s; + double *D_A_pack = 0; + rntm_t rntm; + // when N is equal to 1 call GEMV instead of GEMM if (N == 1) { @@ -1643,7 +1694,44 @@ static err_t bli_dgemm_small { required_packing_A = 0; } - /* + + /* + * This function was using global array to pack part of A input when needed. + * However, using this global array make the function non-reentrant. + * Instead of using a global array we should allocate buffer for each invocation. + * Since the buffer size is too big or stack and doing malloc every time will be too expensive, + * better approach is to get the buffer from the pre-allocated pool and return + * it the pool once we are doing. + * + * In order to get the buffer from pool, we need access to memory broker, + * currently this function is not invoked in such a way that it can receive + * the memory broker (via rntm). Following hack will get the global memory + * broker that can be use it to access the pool. + * + * Note there will be memory allocation at least on first innovation + * as there will not be any pool created for this size. + * Subsequent invocations will just reuse the buffer from the pool. + */ + +#ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_dgemm_small: acquiring mem pool block of size %lu\n", (D_SCRATCH_DIM * sizeof(double))); +#endif + + bli_thread_init_rntm( &rntm ); + bli_rntm_set_num_threads_only( 1, &rntm ); + bli_membrk_rntm_set_membrk( &rntm ); + + // Get the buffer from the pool, if there is no pool with + // required size, it will be created. + bli_membrk_acquire_m( &rntm, + (D_SCRATCH_DIM * sizeof(double)), + BLIS_BITVAL_BUFFER_FOR_A_BLOCK, + &local_mem_buf_A_s + ); + + D_A_pack = bli_mem_buffer(&local_mem_buf_A_s); + + /* * The computation loop runs for D_MRxN columns of C matrix, thus * accessing the D_MRxK A matrix data and KxNR B matrix data. * The computation is organized as inner loops of dimension D_MRxNR. @@ -2963,6 +3051,17 @@ static err_t bli_dgemm_small } } } + + // Return the buffer to pool + if (bli_mem_is_alloc( &local_mem_buf_A_s )) { +#ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_dgemm_small(): releasing mem pool block\n" ); +#endif + bli_membrk_release( &rntm, + &local_mem_buf_A_s + ); + } + return BLIS_SUCCESS; } else