diff --git a/frame/2/gemv/bli_gemv_unf_var1.c b/frame/2/gemv/bli_gemv_unf_var1.c index 25496f4bd..7f65f7168 100644 --- a/frame/2/gemv/bli_gemv_unf_var1.c +++ b/frame/2/gemv/bli_gemv_unf_var1.c @@ -34,6 +34,7 @@ */ #include "blis.h" +#define BLIS_DGEMV_VAR1_FUSE 8 #undef GENTFUNC #define GENTFUNC( ctype, ch, varname ) \ @@ -53,55 +54,55 @@ void PASTEMAC(ch,varname) \ ) \ { \ \ - if(cntx == NULL) cntx = bli_gks_query_cntx(); \ + if(cntx == NULL) cntx = bli_gks_query_cntx(); \ \ - const num_t dt = PASTEMAC(ch,type); \ + const num_t dt = PASTEMAC(ch,type); \ \ - ctype* A1; \ - ctype* x1; \ - ctype* y1; \ - dim_t i; \ - dim_t b_fuse, f; \ - dim_t n_elem, n_iter; \ - inc_t rs_at, cs_at; \ - conj_t conja; \ + ctype* A1; \ + ctype* x1; \ + ctype* y1; \ + dim_t i; \ + dim_t b_fuse, f; \ + dim_t n_elem, n_iter; \ + inc_t rs_at, cs_at; \ + conj_t conja; \ \ - bli_set_dims_incs_with_trans( transa, \ - m, n, rs_a, cs_a, \ - &n_iter, &n_elem, &rs_at, &cs_at ); \ + bli_set_dims_incs_with_trans( transa, \ + m, n, rs_a, cs_a, \ + &n_iter, &n_elem, &rs_at, &cs_at ); \ \ - conja = bli_extract_conj( transa ); \ + conja = bli_extract_conj( transa ); \ \ - PASTECH(ch,dotxf_ker_ft) kfp_df; \ + PASTECH(ch,dotxf_ker_ft) kfp_df; \ \ - /* Query the context for the kernel function pointer and fusing factor. */ \ - kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx ); \ - b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx ); \ + /* Query the context for the kernel function pointer and fusing factor. 
*/ \ + kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx ); \ + b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx ); \ \ - for ( i = 0; i < n_iter; i += f ) \ - { \ - f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse ); \ + for ( i = 0; i < n_iter; i += f ) \ + { \ + f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse ); \ \ - A1 = a + (i )*rs_at + (0 )*cs_at; \ - x1 = x + (0 )*incy; \ - y1 = y + (i )*incy; \ + A1 = a + (i )*rs_at + (0 )*cs_at; \ + x1 = x + (0 )*incy; \ + y1 = y + (i )*incy; \ \ - /* y1 = beta * y1 + alpha * A1 * x; */ \ - kfp_df \ - ( \ - conja, \ - conjx, \ - n_elem, \ - f, \ - alpha, \ - A1, cs_at, rs_at, \ - x1, incx, \ - beta, \ - y1, incy, \ - cntx \ - ); \ + /* y1 = beta * y1 + alpha * A1 * x; */ \ + kfp_df \ + ( \ + conja, \ + conjx, \ + n_elem, \ + f, \ + alpha, \ + A1, cs_at, rs_at, \ + x1, incx, \ + beta, \ + y1, incy, \ + cntx \ + ); \ \ - } \ + } \ } #ifdef BLIS_CONFIG_EPYC @@ -116,57 +117,115 @@ void bli_dgemv_unf_var1 double* x, inc_t incx, double* beta, double* y, inc_t incy, - cntx_t* cntx + cntx_t* cntx ) { - double* A1; - double* x1; - double* y1; - dim_t i; - dim_t b_fuse, f; - dim_t n_elem, n_iter; - inc_t rs_at, cs_at; - conj_t conja; + AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_3); /* pairs with the AOCL_DTL_TRACE_EXIT at the end of this function */ + double* A1; + double* y1; + dim_t i; + dim_t f; + dim_t n_elem, n_iter; + inc_t rs_at, cs_at; + conj_t conja; + //memory pool declarations for packing vector X.
+ mem_t mem_bufX; + rntm_t rntm; + double *x_buf = x; + inc_t buf_incx = incx; - bli_init_once(); + bli_init_once(); - if( cntx == NULL ) cntx = bli_gks_query_cntx(); + if( cntx == NULL ) cntx = bli_gks_query_cntx(); - bli_set_dims_incs_with_trans( transa, - m, n, rs_a, cs_a, - &n_iter, &n_elem, &rs_at, &cs_at ); + bli_set_dims_incs_with_trans( transa, + m, n, rs_a, cs_a, + &n_iter, &n_elem, &rs_at, &cs_at ); - conja = bli_extract_conj( transa ); + conja = bli_extract_conj( transa ); + if (incx > 1) + { + /* + Initialize the mem pool buffer to NULL and its size to 0. + The "buf" and "size" fields are assigned once memory + is allocated from the pool in bli_membrk_acquire_m(). + This ensures that bli_mem_is_alloc() sees either an + allocated block, if one was created, or NULL. + */ + mem_bufX.pblk.buf = NULL; mem_bufX.pblk.block_size = 0; + mem_bufX.buf_type = 0; mem_bufX.size = 0; + mem_bufX.pool = NULL; - /* Query the context for the kernel function pointer and fusing factor. */ - b_fuse = 8; - - for ( i = 0; i < n_iter; i += f ) - { - f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse ); + /* To obtain a buffer from the pool, the rntm needs access to the memory + broker. The following calls initialize the rntm. */ - A1 = a + (i )*rs_at + (0 )*cs_at; - x1 = x + (0 )*incy; - y1 = y + (i )*incy; + bli_rntm_init_from_global( &rntm ); + bli_rntm_set_num_threads_only( 1, &rntm ); + bli_membrk_rntm_set_membrk( &rntm ); - /* y1 = beta * y1 + alpha * A1 * x; */ - bli_ddotxf_zen_int_8 - ( - conja, - conjx, - n_elem, - f, - alpha, - A1, cs_at, rs_at, - x1, incx, - beta, - y1, incy, - cntx - ); + //calculate the size required for n_elem double elements in vector X.
+ size_t buffer_size = n_elem * sizeof(double); + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_dgemv_unf_var1(): get mem pool block\n" ); + #endif + + /* Acquire a buffer of n_elem*sizeof(double) bytes from the memory broker + and save the associated mem_t entry in mem_bufX. */ + bli_membrk_acquire_m(&rntm, + buffer_size, + BLIS_BUFFER_FOR_B_PANEL, + &mem_bufX); + + /* Pack X only if the buffer was allocated; otherwise x_buf and buf_incx keep referring to the caller's strided x. */ + if ((bli_mem_is_alloc( &mem_bufX ))) + { + x_buf = bli_mem_buffer(&mem_bufX); + + // Pack the non-unit-stride vector X into the unit-stride temporary buffer x_buf. + for(dim_t x_index = 0 ; x_index < n_elem ; x_index++) + { + *(x_buf + x_index) = *(x + (x_index * incx)) ; + } + // x_buf is contiguous, so its stride is 1 + buf_incx = 1; + } + } + + for ( i = 0; i < n_iter; i += f ) + { + f = bli_determine_blocksize_dim_f( i, n_iter, BLIS_DGEMV_VAR1_FUSE ); + + A1 = a + (i )*rs_at + (0 )*cs_at; + y1 = y + (i )*incy; + + /* y1 = beta * y1 + alpha * A1 * x; */ + bli_ddotxf_zen_int_8 + ( + conja, + conjx, + n_elem, + f, + alpha, + A1, cs_at, rs_at, + x_buf, buf_incx, + beta, + y1, incy, + cntx + ); + + } + if ((incx > 1) && bli_mem_is_alloc( &mem_bufX )) + { + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_dgemv_unf_var1(): releasing mem pool block\n" ); + #endif + // Return the buffer to the pool. + bli_membrk_release(&rntm , &mem_bufX); + } + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3); } void bli_sgemv_unf_var1 @@ -180,57 +239,57 @@ void bli_sgemv_unf_var1 float* x, inc_t incx, float* beta, float* y, inc_t incy, - cntx_t* cntx + cntx_t* cntx ) { - float* A1; - float* x1; - float* y1; - dim_t i; - dim_t b_fuse, f; - dim_t n_elem, n_iter; - inc_t rs_at, cs_at; - conj_t conja; + float* A1; + float* x1; + float* y1; + dim_t i; + dim_t b_fuse, f; + dim_t n_elem, n_iter; + inc_t rs_at, cs_at; + conj_t conja; - bli_init_once(); + bli_init_once(); - if( cntx == NULL ) cntx = bli_gks_query_cntx(); + if( cntx == NULL ) cntx = bli_gks_query_cntx(); - bli_set_dims_incs_with_trans( transa, - m,
n, rs_a, cs_a, - &n_iter, &n_elem, &rs_at, &cs_at ); + bli_set_dims_incs_with_trans( transa, + m, n, rs_a, cs_a, + &n_iter, &n_elem, &rs_at, &cs_at ); - conja = bli_extract_conj( transa ); + conja = bli_extract_conj( transa ); - /* Query the context for the kernel function pointer and fusing factor. */ - b_fuse = 8; - - for ( i = 0; i < n_iter; i += f ) - { - f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse ); + /* Fusing factor; it is hard-coded here, not queried from the context. */ + b_fuse = 8; - A1 = a + (i )*rs_at + (0 )*cs_at; - x1 = x + (0 )*incy; - y1 = y + (i )*incy; + for ( i = 0; i < n_iter; i += f ) + { + f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse ); - /* y1 = beta * y1 + alpha * A1 * x; */ - bli_sdotxf_zen_int_8 - ( - conja, - conjx, - n_elem, - f, - alpha, - A1, cs_at, rs_at, - x1, incx, - beta, - y1, incy, - cntx - ); + A1 = a + (i )*rs_at + (0 )*cs_at; + x1 = x + (0 )*incy; /* NOTE(review): the offset is zero, so using incy rather than incx here has no effect */ + y1 = y + (i )*incy; - } + /* y1 = beta * y1 + alpha * A1 * x; */ + bli_sdotxf_zen_int_8 + ( + conja, + conjx, + n_elem, + f, + alpha, + A1, cs_at, rs_at, + x1, incx, + beta, + y1, incy, + cntx + ); + + } } INSERT_GENTFUNC_BASIC0_CZ( gemv_unf_var1 ) diff --git a/frame/2/gemv/bli_gemv_unf_var2.c b/frame/2/gemv/bli_gemv_unf_var2.c index 790f0bd9e..cb77e3073 100644 --- a/frame/2/gemv/bli_gemv_unf_var2.c +++ b/frame/2/gemv/bli_gemv_unf_var2.c @@ -34,6 +34,7 @@ */ #include "blis.h" +#define BLIS_DGEMV_VAR2_FUSE 4 #undef GENTFUNC #define GENTFUNC( ctype, ch, varname ) \ @@ -156,12 +157,16 @@ void bli_dgemv_unf_var2 AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_3); double* A1; double* x1; - double* y1; dim_t i; - dim_t b_fuse, f; + dim_t f; dim_t n_elem, n_iter; inc_t rs_at, cs_at; conj_t conja; + //memory pool declarations for packing vector Y.
+ mem_t mem_bufY; + rntm_t rntm; + double *y_buf = y; + inc_t buf_incy = incy; bli_set_dims_incs_with_trans( transa, m, n, rs_a, cs_a, @@ -173,30 +178,76 @@ void bli_dgemv_unf_var2 /* y = beta * y; */ /* beta=0 case is hadled by scalv internally */ - bli_dscalv_zen_int10 - ( - BLIS_NO_CONJUGATE, - n_elem, - beta, - y, incy, - NULL - ); + bli_dscalv_zen_int10 + ( + BLIS_NO_CONJUGATE, + n_elem, + beta, + y, incy, + NULL + ); if( bli_deq0( *alpha ) ) { - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3) - return; + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3) + return; + } + + if (incy > 1) + { + /* + Initialize the mem pool buffer to NULL and its size to 0. + The "buf" and "size" fields are assigned once memory + is allocated from the pool in bli_membrk_acquire_m(). + This ensures that bli_mem_is_alloc() sees either an + allocated block, if one was created, or NULL. + */ + mem_bufY.pblk.buf = NULL; mem_bufY.pblk.block_size = 0; + mem_bufY.buf_type = 0; mem_bufY.size = 0; + mem_bufY.pool = NULL; + + /* To obtain a buffer from the pool, the rntm needs access to the memory + broker. The following calls initialize the rntm. */ + + bli_rntm_init_from_global( &rntm ); + bli_rntm_set_num_threads_only( 1, &rntm ); + bli_membrk_rntm_set_membrk( &rntm ); + + //calculate the size required for n_elem double elements in vector Y.
+ size_t buffer_size = n_elem * sizeof(double); + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_dgemv_unf_var2(): get mem pool block\n" ); + #endif + + /* Acquire a buffer of n_elem*sizeof(double) bytes from the memory broker + and save the associated mem_t entry in mem_bufY. */ + bli_membrk_acquire_m(&rntm, + buffer_size, + BLIS_BUFFER_FOR_B_PANEL, + &mem_bufY); + + /* Pack Y only if the buffer was allocated; otherwise y_buf and buf_incy keep referring to the caller's strided y. */ + if ((bli_mem_is_alloc( &mem_bufY ))) + { + y_buf = bli_mem_buffer(&mem_bufY); + + // Pack the non-unit-stride vector Y (already scaled by beta) into the unit-stride temporary buffer y_buf. + for(dim_t y_index = 0 ; y_index < n_elem ; y_index++) + { + *(y_buf + y_index) = *(y + (y_index * incy)) ; + } + // y_buf is contiguous, so its stride is 1 + buf_incy = 1; + } } - /* Fusing factor. */ - b_fuse = 4; for ( i = 0; i < n_iter; i += f ) { - f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse ); + f = bli_determine_blocksize_dim_f( i, n_iter, BLIS_DGEMV_VAR2_FUSE ); A1 = a + (0 )*rs_at + (i )*cs_at; x1 = x + (i )*incx; - y1 = y + (0 )*incy; /* y = y + alpha * A1 * x1; */ bli_daxpyf_zen_int_16x4 @@ -208,10 +259,25 @@ void bli_dgemv_unf_var2 alpha, A1, rs_at, cs_at, x1, incx, - y1, incy, + y_buf, buf_incy, NULL ); } + if ((incy > 1) && bli_mem_is_alloc( &mem_bufY )) + { + // Copy the result from the unit-stride y_buf back to the non-unit-stride Y. + for(dim_t y_index = 0 ; y_index < n_elem ; y_index++) + { + *(y + (y_index * incy)) = *(y_buf + y_index) ; + } + + #ifdef BLIS_ENABLE_MEM_TRACING + printf( "bli_dgemv_unf_var2(): releasing mem pool block\n" ); + #endif + // Return the buffer to the pool. + bli_membrk_release(&rntm , &mem_bufY); + } + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3); } @@ -250,19 +316,19 @@ void bli_sgemv_unf_var2 /* y = beta * y; */ /* beta=0 case is hadled by scalv internally */ - bli_sscalv_zen_int10 - ( - BLIS_NO_CONJUGATE, - n_elem, - beta, - y, incy, - NULL - ); + bli_sscalv_zen_int10 + ( + BLIS_NO_CONJUGATE, + n_elem, + beta, + y, incy, + NULL + ); if( bli_seq0( *alpha ) ) {
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3) - return; + return; } /* Query the context for the kernel function pointer and fusing factor. */ @@ -342,7 +408,7 @@ void bli_zgemv_unf_var2 if( bli_zeq0( *alpha ) ) { AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3); - return; + return; } /* fusing factor */ @@ -418,8 +484,8 @@ void bli_cgemv_unf_var2 if( bli_ceq0( *alpha ) ) { - AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3) - return; + AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3) + return; } /* fusing factor. */ b_fuse = 4;