Merge " BLIS : DGEMV performance improvement for incy/incx greater than 1" into amd-staging-milan-3.1

This commit is contained in:
Manideep Kurumella
2021-07-21 10:23:47 -04:00
committed by Gerrit Code Review
2 changed files with 270 additions and 146 deletions

View File

@@ -34,6 +34,7 @@
*/
#include "blis.h"
#define BLIS_DGEMV_VAR1_FUSE 8
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
@@ -53,55 +54,55 @@ void PASTEMAC(ch,varname) \
) \
{ \
\
if(cntx == NULL) cntx = bli_gks_query_cntx(); \
if(cntx == NULL) cntx = bli_gks_query_cntx(); \
\
const num_t dt = PASTEMAC(ch,type); \
const num_t dt = PASTEMAC(ch,type); \
\
ctype* A1; \
ctype* x1; \
ctype* y1; \
dim_t i; \
dim_t b_fuse, f; \
dim_t n_elem, n_iter; \
inc_t rs_at, cs_at; \
conj_t conja; \
ctype* A1; \
ctype* x1; \
ctype* y1; \
dim_t i; \
dim_t b_fuse, f; \
dim_t n_elem, n_iter; \
inc_t rs_at, cs_at; \
conj_t conja; \
\
bli_set_dims_incs_with_trans( transa, \
m, n, rs_a, cs_a, \
&n_iter, &n_elem, &rs_at, &cs_at ); \
bli_set_dims_incs_with_trans( transa, \
m, n, rs_a, cs_a, \
&n_iter, &n_elem, &rs_at, &cs_at ); \
\
conja = bli_extract_conj( transa ); \
conja = bli_extract_conj( transa ); \
\
PASTECH(ch,dotxf_ker_ft) kfp_df; \
PASTECH(ch,dotxf_ker_ft) kfp_df; \
\
/* Query the context for the kernel function pointer and fusing factor. */ \
kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx ); \
b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx ); \
/* Query the context for the kernel function pointer and fusing factor. */ \
kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx ); \
b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx ); \
\
for ( i = 0; i < n_iter; i += f ) \
{ \
f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse ); \
for ( i = 0; i < n_iter; i += f ) \
{ \
f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse ); \
\
A1 = a + (i )*rs_at + (0 )*cs_at; \
x1 = x + (0 )*incy; \
y1 = y + (i )*incy; \
A1 = a + (i )*rs_at + (0 )*cs_at; \
x1 = x + (0 )*incy; \
y1 = y + (i )*incy; \
\
/* y1 = beta * y1 + alpha * A1 * x; */ \
kfp_df \
( \
conja, \
conjx, \
n_elem, \
f, \
alpha, \
A1, cs_at, rs_at, \
x1, incx, \
beta, \
y1, incy, \
cntx \
); \
/* y1 = beta * y1 + alpha * A1 * x; */ \
kfp_df \
( \
conja, \
conjx, \
n_elem, \
f, \
alpha, \
A1, cs_at, rs_at, \
x1, incx, \
beta, \
y1, incy, \
cntx \
); \
\
} \
} \
}
#ifdef BLIS_CONFIG_EPYC
@@ -116,57 +117,114 @@ void bli_dgemv_unf_var1
double* x, inc_t incx,
double* beta,
double* y, inc_t incy,
cntx_t* cntx
cntx_t* cntx
)
{
double* A1;
double* x1;
double* y1;
dim_t i;
dim_t b_fuse, f;
dim_t n_elem, n_iter;
inc_t rs_at, cs_at;
conj_t conja;
double* A1;
double* y1;
dim_t i;
dim_t f;
dim_t n_elem, n_iter;
inc_t rs_at, cs_at;
conj_t conja;
//memory pool declarations for packing vector X.
mem_t mem_bufX;
rntm_t rntm;
double *x_buf = x;
inc_t buf_incx = incx;
bli_init_once();
bli_init_once();
if( cntx == NULL ) cntx = bli_gks_query_cntx();
if( cntx == NULL ) cntx = bli_gks_query_cntx();
bli_set_dims_incs_with_trans( transa,
m, n, rs_a, cs_a,
&n_iter, &n_elem, &rs_at, &cs_at );
bli_set_dims_incs_with_trans( transa,
m, n, rs_a, cs_a,
&n_iter, &n_elem, &rs_at, &cs_at );
conja = bli_extract_conj( transa );
conja = bli_extract_conj( transa );
if (incx > 1)
{
/*
Initialize the mem pool buffer to NULL and its size to 0.
The "buf" and "size" fields are assigned once memory
is allocated from the pool in bli_membrk_acquire_m().
This ensures that bli_mem_is_alloc() is always passed either
an allocated buffer or a NULL one.
*/
mem_bufX.pblk.buf = NULL; mem_bufX.pblk.block_size = 0;
mem_bufX.buf_type = 0; mem_bufX.size = 0;
mem_bufX.pool = NULL;
/* Query the context for the kernel function pointer and fusing factor. */
b_fuse = 8;
for ( i = 0; i < n_iter; i += f )
{
f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );
/* Access to the memory broker is needed in order to get the buffer from
the pool via rntm. The following calls initialize rntm. */
A1 = a + (i )*rs_at + (0 )*cs_at;
x1 = x + (0 )*incy;
y1 = y + (i )*incy;
bli_rntm_init_from_global( &rntm );
bli_rntm_set_num_threads_only( 1, &rntm );
bli_membrk_rntm_set_membrk( &rntm );
/* y1 = beta * y1 + alpha * A1 * x; */
bli_ddotxf_zen_int_8
(
conja,
conjx,
n_elem,
f,
alpha,
A1, cs_at, rs_at,
x1, incx,
beta,
y1, incy,
cntx
);
//calculate the size required for n_elem double elements in vector X.
size_t buffer_size = n_elem * sizeof(double);
}
#ifdef BLIS_ENABLE_MEM_TRACING
printf( "bli_dgemv_unf_var1(): get mem pool block\n" );
#endif
/* Acquire a buffer of n_elem * sizeof(double) bytes from the memory broker
and save the associated mem_t entry to mem_bufX. */
bli_membrk_acquire_m(&rntm,
buffer_size,
BLIS_BUFFER_FOR_B_PANEL,
&mem_bufX);
/*Continue packing X if buffer memory is allocated*/
if ((bli_mem_is_alloc( &mem_bufX )))
{
x_buf = bli_mem_buffer(&mem_bufX);
//pack X vector with non-unit stride to a temp buffer x_buf with unit stride
for(dim_t x_index = 0 ; x_index < n_elem ; x_index++)
{
*(x_buf + x_index) = *(x + (x_index * incx)) ;
}
// stride of vector x_buf =1
buf_incx = 1;
}
}
for ( i = 0; i < n_iter; i += f )
{
f = bli_determine_blocksize_dim_f( i, n_iter, BLIS_DGEMV_VAR1_FUSE );
A1 = a + (i )*rs_at + (0 )*cs_at;
y1 = y + (i )*incy;
/* y1 = beta * y1 + alpha * A1 * x; */
bli_ddotxf_zen_int_8
(
conja,
conjx,
n_elem,
f,
alpha,
A1, cs_at, rs_at,
x_buf, buf_incx,
beta,
y1, incy,
cntx
);
}
if ((incx > 1) && bli_mem_is_alloc( &mem_bufX ))
{
#ifdef BLIS_ENABLE_MEM_TRACING
printf( "bli_dgemv_unf_var1(): releasing mem pool block\n" );
#endif
// Return the buffer to pool
bli_membrk_release(&rntm , &mem_bufX);
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
}
void bli_sgemv_unf_var1
@@ -180,57 +238,57 @@ void bli_sgemv_unf_var1
float* x, inc_t incx,
float* beta,
float* y, inc_t incy,
cntx_t* cntx
cntx_t* cntx
)
{
float* A1;
float* x1;
float* y1;
dim_t i;
dim_t b_fuse, f;
dim_t n_elem, n_iter;
inc_t rs_at, cs_at;
conj_t conja;
float* A1;
float* x1;
float* y1;
dim_t i;
dim_t b_fuse, f;
dim_t n_elem, n_iter;
inc_t rs_at, cs_at;
conj_t conja;
bli_init_once();
bli_init_once();
if( cntx == NULL ) cntx = bli_gks_query_cntx();
if( cntx == NULL ) cntx = bli_gks_query_cntx();
bli_set_dims_incs_with_trans( transa,
m, n, rs_a, cs_a,
&n_iter, &n_elem, &rs_at, &cs_at );
bli_set_dims_incs_with_trans( transa,
m, n, rs_a, cs_a,
&n_iter, &n_elem, &rs_at, &cs_at );
conja = bli_extract_conj( transa );
conja = bli_extract_conj( transa );
/* Query the context for the kernel function pointer and fusing factor. */
b_fuse = 8;
for ( i = 0; i < n_iter; i += f )
{
f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );
/* Query the context for the kernel function pointer and fusing factor. */
b_fuse = 8;
A1 = a + (i )*rs_at + (0 )*cs_at;
x1 = x + (0 )*incy;
y1 = y + (i )*incy;
for ( i = 0; i < n_iter; i += f )
{
f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );
/* y1 = beta * y1 + alpha * A1 * x; */
bli_sdotxf_zen_int_8
(
conja,
conjx,
n_elem,
f,
alpha,
A1, cs_at, rs_at,
x1, incx,
beta,
y1, incy,
cntx
);
A1 = a + (i )*rs_at + (0 )*cs_at;
x1 = x + (0 )*incy;
y1 = y + (i )*incy;
}
/* y1 = beta * y1 + alpha * A1 * x; */
bli_sdotxf_zen_int_8
(
conja,
conjx,
n_elem,
f,
alpha,
A1, cs_at, rs_at,
x1, incx,
beta,
y1, incy,
cntx
);
}
}
INSERT_GENTFUNC_BASIC0_CZ( gemv_unf_var1 )

View File

@@ -34,6 +34,7 @@
*/
#include "blis.h"
#define BLIS_DGEMV_VAR2_FUSE 4
#undef GENTFUNC
#define GENTFUNC( ctype, ch, varname ) \
@@ -156,12 +157,16 @@ void bli_dgemv_unf_var2
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_3);
double* A1;
double* x1;
double* y1;
dim_t i;
dim_t b_fuse, f;
dim_t f;
dim_t n_elem, n_iter;
inc_t rs_at, cs_at;
conj_t conja;
//memory pool declarations for packing vector Y.
mem_t mem_bufY;
rntm_t rntm;
double *y_buf = y;
inc_t buf_incy = incy;
bli_set_dims_incs_with_trans( transa,
m, n, rs_a, cs_a,
@@ -173,30 +178,76 @@ void bli_dgemv_unf_var2
/* y = beta * y; */
/* beta=0 case is handled by scalv internally */
bli_dscalv_zen_int10
(
BLIS_NO_CONJUGATE,
n_elem,
beta,
y, incy,
NULL
);
bli_dscalv_zen_int10
(
BLIS_NO_CONJUGATE,
n_elem,
beta,
y, incy,
NULL
);
if( bli_deq0( *alpha ) )
{
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3)
return;
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3)
return;
}
if (incy > 1)
{
/*
Initialize the mem pool buffer to NULL and its size to 0.
The "buf" and "size" fields are assigned once memory
is allocated from the pool in bli_membrk_acquire_m().
This ensures that bli_mem_is_alloc() is always passed either
an allocated buffer or a NULL one.
*/
mem_bufY.pblk.buf = NULL; mem_bufY.pblk.block_size = 0;
mem_bufY.buf_type = 0; mem_bufY.size = 0;
mem_bufY.pool = NULL;
/* Access to the memory broker is needed in order to get the buffer from
the pool via rntm. The following calls initialize rntm. */
bli_rntm_init_from_global( &rntm );
bli_rntm_set_num_threads_only( 1, &rntm );
bli_membrk_rntm_set_membrk( &rntm );
//calculate the size required for n_elem double elements in vector Y.
size_t buffer_size = n_elem * sizeof(double);
#ifdef BLIS_ENABLE_MEM_TRACING
printf( "bli_dgemv_unf_var2(): get mem pool block\n" );
#endif
/* Acquire a buffer of n_elem * sizeof(double) bytes from the memory broker
and save the associated mem_t entry to mem_bufY. */
bli_membrk_acquire_m(&rntm,
buffer_size,
BLIS_BUFFER_FOR_B_PANEL,
&mem_bufY);
/*Continue packing Y if buffer memory is allocated*/
if ((bli_mem_is_alloc( &mem_bufY )))
{
y_buf = bli_mem_buffer(&mem_bufY);
//pack Y vector with non-unit stride to a temp buffer y_buf with unit stride
for(dim_t y_index = 0 ; y_index < n_elem ; y_index++)
{
*(y_buf + y_index) = *(y + (y_index * incy)) ;
}
// stride of vector y_buf =1
buf_incy = 1;
}
}
/* Fusing factor. */
b_fuse = 4;
for ( i = 0; i < n_iter; i += f )
{
f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );
f = bli_determine_blocksize_dim_f( i, n_iter, BLIS_DGEMV_VAR2_FUSE );
A1 = a + (0 )*rs_at + (i )*cs_at;
x1 = x + (i )*incx;
y1 = y + (0 )*incy;
/* y = y + alpha * A1 * x1; */
bli_daxpyf_zen_int_16x4
@@ -208,10 +259,25 @@ void bli_dgemv_unf_var2
alpha,
A1, rs_at, cs_at,
x1, incx,
y1, incy,
y_buf, buf_incy,
NULL
);
}
if ((incy > 1) && bli_mem_is_alloc( &mem_bufY ))
{
//store the result from unit strided y_buf to non-unit strided Y
for(dim_t y_index = 0 ; y_index < n_elem ; y_index++)
{
*(y + (y_index * incy)) = *(y_buf + y_index) ;
}
#ifdef BLIS_ENABLE_MEM_TRACING
printf( "bli_dgemv_unf_var2(): releasing mem pool block\n" );
#endif
// Return the buffer to pool
bli_membrk_release(&rntm , &mem_bufY);
}
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
}
@@ -250,19 +316,19 @@ void bli_sgemv_unf_var2
/* y = beta * y; */
/* beta=0 case is handled by scalv internally */
bli_sscalv_zen_int10
(
BLIS_NO_CONJUGATE,
n_elem,
beta,
y, incy,
NULL
);
bli_sscalv_zen_int10
(
BLIS_NO_CONJUGATE,
n_elem,
beta,
y, incy,
NULL
);
if( bli_seq0( *alpha ) )
{
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3)
return;
return;
}
/* Query the context for the kernel function pointer and fusing factor. */
@@ -342,7 +408,7 @@ void bli_zgemv_unf_var2
if( bli_zeq0( *alpha ) )
{
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
return;
return;
}
/* fusing factor */
@@ -418,8 +484,8 @@ void bli_cgemv_unf_var2
if( bli_ceq0( *alpha ) )
{
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3)
return;
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3)
return;
}
/* fusing factor. */
b_fuse = 4;