diff --git a/kernels/zen/3/bli_gemm_small.c b/kernels/zen/3/bli_gemm_small.c
index 1db3b59ea..85d097247 100644
--- a/kernels/zen/3/bli_gemm_small.c
+++ b/kernels/zen/3/bli_gemm_small.c
@@ -44,12 +44,10 @@
 
 #define BLIS_ENABLE_PREFETCH
 #define F_SCRATCH_DIM (BLIS_SMALL_MATRIX_THRES * BLIS_SMALL_MATRIX_THRES)
-static float A_pack[F_SCRATCH_DIM]  __attribute__((aligned(64)));
 #define D_BLIS_SMALL_MATRIX_THRES (BLIS_SMALL_MATRIX_THRES / 2 )
 #define D_BLIS_SMALL_M_RECT_MATRIX_THRES (BLIS_SMALL_M_RECT_MATRIX_THRES / 2)
 #define D_BLIS_SMALL_K_RECT_MATRIX_THRES (BLIS_SMALL_K_RECT_MATRIX_THRES / 2)
 #define D_SCRATCH_DIM (D_BLIS_SMALL_MATRIX_THRES * D_BLIS_SMALL_MATRIX_THRES)
-static double D_A_pack[D_SCRATCH_DIM]  __attribute__((aligned(64)));
 #define BLIS_ATBN_M_THRES 40 // Threshold value of M for/below which small matrix code is called. 
 #define AT_MR 4 // The kernel dimension of the A transpose GEMM kernel.(AT_MR * NR).
 static err_t bli_sgemm_small
@@ -210,7 +208,10 @@ static err_t bli_sgemm_small
         alpha_cast = (alpha->buffer);
         beta_cast = (beta->buffer);
         gint_t required_packing_A = 1;
-
+	mem_t local_mem_buf_A_s;
+	float *A_pack = 0;
+	rntm_t rntm;
+	
         // when N is equal to 1 call GEMV instead of GEMM
         if (N == 1)
         {
@@ -236,6 +237,41 @@ static err_t bli_sgemm_small
         {
             required_packing_A = 0;
         }
+
+	/*
+	 * This function was using global array to pack part of A input when needed.
+	 * However, using this global array make the function non-reentrant.
+	 * Instead of using a global array we should allocate buffer for each invocation.
+	 * Since the buffer size is too big or stack and doing malloc every time will be too expensive,
+	 * better approach is to get the buffer from the pre-allocated pool and return
+	 * it the pool once we are doing.
+         *
+	 * In order to get the buffer from pool, we need access to memory broker,
+	 * currently this function is not invoked in such a way that it can receive
+	 * the memory broker (via rntm). Following hack will get the global memory
+	 * broker that can be use it to access the pool.
+	 *
+	 * Note there will be memory allocation at least on first innovation
+	 * as there will not be any pool created for this size.
+	 * Subsequent invocations will just reuse the buffer from the pool.
+	 */
+#ifdef BLIS_ENABLE_MEM_TRACING
+	printf( "bli_sgemm_small: acquiring mem pool block of size %lu\n", (F_SCRATCH_DIM * sizeof(float)));
+#endif	
+	bli_thread_init_rntm( &rntm );
+	bli_rntm_set_num_threads_only( 1, &rntm );
+	bli_membrk_rntm_set_membrk( &rntm );
+
+	// Get the buffer from the pool, if there is no pool with
+	// required size, it will be created. 
+	bli_membrk_acquire_m(&rntm,
+			     (F_SCRATCH_DIM * sizeof(float)),
+			     BLIS_BITVAL_BUFFER_FOR_A_BLOCK,
+			     &local_mem_buf_A_s
+			     );
+
+	A_pack = bli_mem_buffer(&local_mem_buf_A_s);
+
         /*
         * The computation loop runs for MRxN columns of C matrix, thus
         * accessing the MRxK A matrix data and KxNR B matrix data.
@@ -1555,6 +1591,18 @@ static err_t bli_sgemm_small
                 }
             }
         }
+
+	// Return the buffer to pool
+	if (bli_mem_is_alloc( &local_mem_buf_A_s) ) {
+
+#ifdef BLIS_ENABLE_MEM_TRACING
+	  printf( "bli_sgemm_small(): releasing mem pool block\n" );
+#endif
+	  bli_membrk_release( &rntm,
+			      &local_mem_buf_A_s
+			      );
+	}
+
         return BLIS_SUCCESS;
     }
     else
@@ -1617,7 +1665,10 @@ static err_t bli_dgemm_small
         alpha_cast = (alpha->buffer);
         beta_cast = (beta->buffer);
         gint_t required_packing_A = 1;
-
+	mem_t local_mem_buf_A_s;
+	double *D_A_pack = 0;
+	rntm_t rntm;
+	
         // when N is equal to 1 call GEMV instead of GEMM
         if (N == 1)
         {
@@ -1643,7 +1694,44 @@ static err_t bli_dgemm_small
         {
             required_packing_A = 0;
         }
-        /*
+
+	/*
+	 * This function was using global array to pack part of A input when needed.
+	 * However, using this global array make the function non-reentrant.
+	 * Instead of using a global array we should allocate buffer for each invocation.
+	 * Since the buffer size is too big or stack and doing malloc every time will be too expensive,
+	 * better approach is to get the buffer from the pre-allocated pool and return
+	 * it the pool once we are doing.
+         *
+	 * In order to get the buffer from pool, we need access to memory broker,
+	 * currently this function is not invoked in such a way that it can receive
+	 * the memory broker (via rntm). Following hack will get the global memory
+	 * broker that can be use it to access the pool.
+	 *
+	 * Note there will be memory allocation at least on first innovation
+	 * as there will not be any pool created for this size.
+	 * Subsequent invocations will just reuse the buffer from the pool.
+	 */
+
+#ifdef BLIS_ENABLE_MEM_TRACING
+	printf( "bli_dgemm_small: acquiring mem pool block of size %lu\n", (D_SCRATCH_DIM * sizeof(double)));
+#endif
+
+  	bli_thread_init_rntm( &rntm );
+	bli_rntm_set_num_threads_only( 1, &rntm );
+	bli_membrk_rntm_set_membrk( &rntm );
+
+	// Get the buffer from the pool, if there is no pool with
+	// required size, it will be created. 
+	bli_membrk_acquire_m( &rntm,
+			      (D_SCRATCH_DIM * sizeof(double)),
+			      BLIS_BITVAL_BUFFER_FOR_A_BLOCK,
+			      &local_mem_buf_A_s
+			      );
+
+	D_A_pack = bli_mem_buffer(&local_mem_buf_A_s);
+
+	/*
         * The computation loop runs for D_MRxN columns of C matrix, thus
         * accessing the D_MRxK A matrix data and KxNR B matrix data.
         * The computation is organized as inner loops of dimension D_MRxNR.
@@ -2963,6 +3051,17 @@ static err_t bli_dgemm_small
                 }
             }
         }
+
+	// Return the buffer to pool
+	if (bli_mem_is_alloc( &local_mem_buf_A_s )) {
+#ifdef BLIS_ENABLE_MEM_TRACING
+	  printf( "bli_dgemm_small(): releasing mem pool block\n" );
+#endif
+	  bli_membrk_release( &rntm,
+			      &local_mem_buf_A_s
+			      );
+	}
+
         return BLIS_SUCCESS;
     }
     else