mirror of
https://github.com/amd/blis.git
synced 2026-05-11 17:50:00 +00:00
BLIS: DGEMV performance improvement for incy/incx greater than 1
Details:
- Added packing of Y for incy > 1 cases in dgemv_unf_var2.
- Added packing of X for incx > 1 cases in dgemv_unf_var1.
AMD-Internal: [SWLCSG-735]
Change-Id: Ib395f478ba984a85533e4f79b3521d0b2500c30c
This commit is contained in:
@@ -34,6 +34,7 @@
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
#define BLIS_DGEMV_VAR1_FUSE 8
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, varname ) \
|
||||
@@ -53,55 +54,55 @@ void PASTEMAC(ch,varname) \
|
||||
) \
|
||||
{ \
|
||||
\
|
||||
if(cntx == NULL) cntx = bli_gks_query_cntx(); \
|
||||
if(cntx == NULL) cntx = bli_gks_query_cntx(); \
|
||||
\
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
const num_t dt = PASTEMAC(ch,type); \
|
||||
\
|
||||
ctype* A1; \
|
||||
ctype* x1; \
|
||||
ctype* y1; \
|
||||
dim_t i; \
|
||||
dim_t b_fuse, f; \
|
||||
dim_t n_elem, n_iter; \
|
||||
inc_t rs_at, cs_at; \
|
||||
conj_t conja; \
|
||||
ctype* A1; \
|
||||
ctype* x1; \
|
||||
ctype* y1; \
|
||||
dim_t i; \
|
||||
dim_t b_fuse, f; \
|
||||
dim_t n_elem, n_iter; \
|
||||
inc_t rs_at, cs_at; \
|
||||
conj_t conja; \
|
||||
\
|
||||
bli_set_dims_incs_with_trans( transa, \
|
||||
m, n, rs_a, cs_a, \
|
||||
&n_iter, &n_elem, &rs_at, &cs_at ); \
|
||||
bli_set_dims_incs_with_trans( transa, \
|
||||
m, n, rs_a, cs_a, \
|
||||
&n_iter, &n_elem, &rs_at, &cs_at ); \
|
||||
\
|
||||
conja = bli_extract_conj( transa ); \
|
||||
conja = bli_extract_conj( transa ); \
|
||||
\
|
||||
PASTECH(ch,dotxf_ker_ft) kfp_df; \
|
||||
PASTECH(ch,dotxf_ker_ft) kfp_df; \
|
||||
\
|
||||
/* Query the context for the kernel function pointer and fusing factor. */ \
|
||||
kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx ); \
|
||||
b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx ); \
|
||||
/* Query the context for the kernel function pointer and fusing factor. */ \
|
||||
kfp_df = bli_cntx_get_l1f_ker_dt( dt, BLIS_DOTXF_KER, cntx ); \
|
||||
b_fuse = bli_cntx_get_blksz_def_dt( dt, BLIS_DF, cntx ); \
|
||||
\
|
||||
for ( i = 0; i < n_iter; i += f ) \
|
||||
{ \
|
||||
f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse ); \
|
||||
for ( i = 0; i < n_iter; i += f ) \
|
||||
{ \
|
||||
f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse ); \
|
||||
\
|
||||
A1 = a + (i )*rs_at + (0 )*cs_at; \
|
||||
x1 = x + (0 )*incy; \
|
||||
y1 = y + (i )*incy; \
|
||||
A1 = a + (i )*rs_at + (0 )*cs_at; \
|
||||
x1 = x + (0 )*incy; \
|
||||
y1 = y + (i )*incy; \
|
||||
\
|
||||
/* y1 = beta * y1 + alpha * A1 * x; */ \
|
||||
kfp_df \
|
||||
( \
|
||||
conja, \
|
||||
conjx, \
|
||||
n_elem, \
|
||||
f, \
|
||||
alpha, \
|
||||
A1, cs_at, rs_at, \
|
||||
x1, incx, \
|
||||
beta, \
|
||||
y1, incy, \
|
||||
cntx \
|
||||
); \
|
||||
/* y1 = beta * y1 + alpha * A1 * x; */ \
|
||||
kfp_df \
|
||||
( \
|
||||
conja, \
|
||||
conjx, \
|
||||
n_elem, \
|
||||
f, \
|
||||
alpha, \
|
||||
A1, cs_at, rs_at, \
|
||||
x1, incx, \
|
||||
beta, \
|
||||
y1, incy, \
|
||||
cntx \
|
||||
); \
|
||||
\
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
#ifdef BLIS_CONFIG_EPYC
|
||||
@@ -116,57 +117,114 @@ void bli_dgemv_unf_var1
|
||||
double* x, inc_t incx,
|
||||
double* beta,
|
||||
double* y, inc_t incy,
|
||||
cntx_t* cntx
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
|
||||
double* A1;
|
||||
double* x1;
|
||||
double* y1;
|
||||
dim_t i;
|
||||
dim_t b_fuse, f;
|
||||
dim_t n_elem, n_iter;
|
||||
inc_t rs_at, cs_at;
|
||||
conj_t conja;
|
||||
double* A1;
|
||||
double* y1;
|
||||
dim_t i;
|
||||
dim_t f;
|
||||
dim_t n_elem, n_iter;
|
||||
inc_t rs_at, cs_at;
|
||||
conj_t conja;
|
||||
//memory pool declarations for packing vector X.
|
||||
mem_t mem_bufX;
|
||||
rntm_t rntm;
|
||||
double *x_buf = x;
|
||||
inc_t buf_incx = incx;
|
||||
|
||||
bli_init_once();
|
||||
bli_init_once();
|
||||
|
||||
if( cntx == NULL ) cntx = bli_gks_query_cntx();
|
||||
if( cntx == NULL ) cntx = bli_gks_query_cntx();
|
||||
|
||||
bli_set_dims_incs_with_trans( transa,
|
||||
m, n, rs_a, cs_a,
|
||||
&n_iter, &n_elem, &rs_at, &cs_at );
|
||||
bli_set_dims_incs_with_trans( transa,
|
||||
m, n, rs_a, cs_a,
|
||||
&n_iter, &n_elem, &rs_at, &cs_at );
|
||||
|
||||
conja = bli_extract_conj( transa );
|
||||
conja = bli_extract_conj( transa );
|
||||
|
||||
if (incx > 1)
|
||||
{
|
||||
/*
|
||||
Initialize mem pool buffer to NULL and size to 0
|
||||
"buf" and "size" fields are assigned once memory
|
||||
is allocated from the pool in bli_membrk_acquire_m().
|
||||
This ensures that bli_mem_is_alloc() is always given either
|
||||
an allocated buffer (if one was created) or NULL.
|
||||
*/
|
||||
mem_bufX.pblk.buf = NULL; mem_bufX.pblk.block_size = 0;
|
||||
mem_bufX.buf_type = 0; mem_bufX.size = 0;
|
||||
mem_bufX.pool = NULL;
|
||||
|
||||
/* Query the context for the kernel function pointer and fusing factor. */
|
||||
b_fuse = 8;
|
||||
|
||||
for ( i = 0; i < n_iter; i += f )
|
||||
{
|
||||
f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );
|
||||
/* In order to get the buffer from pool via rntm access to memory broker
|
||||
is needed. The following calls initialize rntm. */
|
||||
|
||||
A1 = a + (i )*rs_at + (0 )*cs_at;
|
||||
x1 = x + (0 )*incy;
|
||||
y1 = y + (i )*incy;
|
||||
bli_rntm_init_from_global( &rntm );
|
||||
bli_rntm_set_num_threads_only( 1, &rntm );
|
||||
bli_membrk_rntm_set_membrk( &rntm );
|
||||
|
||||
/* y1 = beta * y1 + alpha * A1 * x; */
|
||||
bli_ddotxf_zen_int_8
|
||||
(
|
||||
conja,
|
||||
conjx,
|
||||
n_elem,
|
||||
f,
|
||||
alpha,
|
||||
A1, cs_at, rs_at,
|
||||
x1, incx,
|
||||
beta,
|
||||
y1, incy,
|
||||
cntx
|
||||
);
|
||||
//calculate the size required for n_elem double elements in vector X.
|
||||
size_t buffer_size = n_elem * sizeof(double);
|
||||
|
||||
}
|
||||
#ifdef BLIS_ENABLE_MEM_TRACING
|
||||
printf( "bli_dgemv_unf_var1(): get mem pool block\n" );
|
||||
#endif
|
||||
|
||||
/*acquire a Buffer(n_elem*size(double)) from the memory broker
|
||||
and save the associated mem_t entry to mem_bufX.*/
|
||||
bli_membrk_acquire_m(&rntm,
|
||||
buffer_size,
|
||||
BLIS_BUFFER_FOR_B_PANEL,
|
||||
&mem_bufX);
|
||||
|
||||
/*Continue packing X if buffer memory is allocated*/
|
||||
if ((bli_mem_is_alloc( &mem_bufX )))
|
||||
{
|
||||
x_buf = bli_mem_buffer(&mem_bufX);
|
||||
|
||||
//pack X vector with non-unit stride to a temp buffer x_buf with unit stride
|
||||
for(dim_t x_index = 0 ; x_index < n_elem ; x_index++)
|
||||
{
|
||||
*(x_buf + x_index) = *(x + (x_index * incx)) ;
|
||||
}
|
||||
// stride of vector x_buf =1
|
||||
buf_incx = 1;
|
||||
}
|
||||
}
|
||||
|
||||
for ( i = 0; i < n_iter; i += f )
|
||||
{
|
||||
f = bli_determine_blocksize_dim_f( i, n_iter, BLIS_DGEMV_VAR1_FUSE );
|
||||
|
||||
A1 = a + (i )*rs_at + (0 )*cs_at;
|
||||
y1 = y + (i )*incy;
|
||||
|
||||
/* y1 = beta * y1 + alpha * A1 * x; */
|
||||
bli_ddotxf_zen_int_8
|
||||
(
|
||||
conja,
|
||||
conjx,
|
||||
n_elem,
|
||||
f,
|
||||
alpha,
|
||||
A1, cs_at, rs_at,
|
||||
x_buf, buf_incx,
|
||||
beta,
|
||||
y1, incy,
|
||||
cntx
|
||||
);
|
||||
|
||||
}
|
||||
if ((incx > 1) && bli_mem_is_alloc( &mem_bufX ))
|
||||
{
|
||||
#ifdef BLIS_ENABLE_MEM_TRACING
|
||||
printf( "bli_dgemv_unf_var1(): releasing mem pool block\n" );
|
||||
#endif
|
||||
// Return the buffer to pool
|
||||
bli_membrk_release(&rntm , &mem_bufX);
|
||||
}
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
|
||||
}
|
||||
|
||||
void bli_sgemv_unf_var1
|
||||
@@ -180,57 +238,57 @@ void bli_sgemv_unf_var1
|
||||
float* x, inc_t incx,
|
||||
float* beta,
|
||||
float* y, inc_t incy,
|
||||
cntx_t* cntx
|
||||
cntx_t* cntx
|
||||
)
|
||||
{
|
||||
|
||||
float* A1;
|
||||
float* x1;
|
||||
float* y1;
|
||||
dim_t i;
|
||||
dim_t b_fuse, f;
|
||||
dim_t n_elem, n_iter;
|
||||
inc_t rs_at, cs_at;
|
||||
conj_t conja;
|
||||
float* A1;
|
||||
float* x1;
|
||||
float* y1;
|
||||
dim_t i;
|
||||
dim_t b_fuse, f;
|
||||
dim_t n_elem, n_iter;
|
||||
inc_t rs_at, cs_at;
|
||||
conj_t conja;
|
||||
|
||||
bli_init_once();
|
||||
bli_init_once();
|
||||
|
||||
if( cntx == NULL ) cntx = bli_gks_query_cntx();
|
||||
if( cntx == NULL ) cntx = bli_gks_query_cntx();
|
||||
|
||||
bli_set_dims_incs_with_trans( transa,
|
||||
m, n, rs_a, cs_a,
|
||||
&n_iter, &n_elem, &rs_at, &cs_at );
|
||||
bli_set_dims_incs_with_trans( transa,
|
||||
m, n, rs_a, cs_a,
|
||||
&n_iter, &n_elem, &rs_at, &cs_at );
|
||||
|
||||
conja = bli_extract_conj( transa );
|
||||
conja = bli_extract_conj( transa );
|
||||
|
||||
|
||||
/* Query the context for the kernel function pointer and fusing factor. */
|
||||
b_fuse = 8;
|
||||
|
||||
for ( i = 0; i < n_iter; i += f )
|
||||
{
|
||||
f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );
|
||||
/* Query the context for the kernel function pointer and fusing factor. */
|
||||
b_fuse = 8;
|
||||
|
||||
A1 = a + (i )*rs_at + (0 )*cs_at;
|
||||
x1 = x + (0 )*incy;
|
||||
y1 = y + (i )*incy;
|
||||
for ( i = 0; i < n_iter; i += f )
|
||||
{
|
||||
f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );
|
||||
|
||||
/* y1 = beta * y1 + alpha * A1 * x; */
|
||||
bli_sdotxf_zen_int_8
|
||||
(
|
||||
conja,
|
||||
conjx,
|
||||
n_elem,
|
||||
f,
|
||||
alpha,
|
||||
A1, cs_at, rs_at,
|
||||
x1, incx,
|
||||
beta,
|
||||
y1, incy,
|
||||
cntx
|
||||
);
|
||||
A1 = a + (i )*rs_at + (0 )*cs_at;
|
||||
x1 = x + (0 )*incy;
|
||||
y1 = y + (i )*incy;
|
||||
|
||||
}
|
||||
/* y1 = beta * y1 + alpha * A1 * x; */
|
||||
bli_sdotxf_zen_int_8
|
||||
(
|
||||
conja,
|
||||
conjx,
|
||||
n_elem,
|
||||
f,
|
||||
alpha,
|
||||
A1, cs_at, rs_at,
|
||||
x1, incx,
|
||||
beta,
|
||||
y1, incy,
|
||||
cntx
|
||||
);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
INSERT_GENTFUNC_BASIC0_CZ( gemv_unf_var1 )
|
||||
|
||||
@@ -34,6 +34,7 @@
|
||||
*/
|
||||
|
||||
#include "blis.h"
|
||||
#define BLIS_DGEMV_VAR2_FUSE 4
|
||||
|
||||
#undef GENTFUNC
|
||||
#define GENTFUNC( ctype, ch, varname ) \
|
||||
@@ -156,12 +157,16 @@ void bli_dgemv_unf_var2
|
||||
AOCL_DTL_TRACE_ENTRY(AOCL_DTL_LEVEL_TRACE_3);
|
||||
double* A1;
|
||||
double* x1;
|
||||
double* y1;
|
||||
dim_t i;
|
||||
dim_t b_fuse, f;
|
||||
dim_t f;
|
||||
dim_t n_elem, n_iter;
|
||||
inc_t rs_at, cs_at;
|
||||
conj_t conja;
|
||||
//memory pool declarations for packing vector Y.
|
||||
mem_t mem_bufY;
|
||||
rntm_t rntm;
|
||||
double *y_buf = y;
|
||||
inc_t buf_incy = incy;
|
||||
|
||||
bli_set_dims_incs_with_trans( transa,
|
||||
m, n, rs_a, cs_a,
|
||||
@@ -173,30 +178,76 @@ void bli_dgemv_unf_var2
|
||||
/* y = beta * y; */
|
||||
/* beta=0 case is handled by scalv internally */
|
||||
|
||||
bli_dscalv_zen_int10
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n_elem,
|
||||
beta,
|
||||
y, incy,
|
||||
NULL
|
||||
);
|
||||
bli_dscalv_zen_int10
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n_elem,
|
||||
beta,
|
||||
y, incy,
|
||||
NULL
|
||||
);
|
||||
|
||||
if( bli_deq0( *alpha ) )
|
||||
{
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3)
|
||||
return;
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3)
|
||||
return;
|
||||
}
|
||||
|
||||
if (incy > 1)
|
||||
{
|
||||
/*
|
||||
Initialize mem pool buffer to NULL and size to 0
|
||||
"buf" and "size" fields are assigned once memory
|
||||
is allocated from the pool in bli_membrk_acquire_m().
|
||||
This ensures that bli_mem_is_alloc() is always given either
|
||||
an allocated buffer (if one was created) or NULL.
|
||||
*/
|
||||
mem_bufY.pblk.buf = NULL; mem_bufY.pblk.block_size = 0;
|
||||
mem_bufY.buf_type = 0; mem_bufY.size = 0;
|
||||
mem_bufY.pool = NULL;
|
||||
|
||||
/* In order to get the buffer from pool via rntm access to memory broker
|
||||
is needed. The following calls initialize rntm. */
|
||||
|
||||
bli_rntm_init_from_global( &rntm );
|
||||
bli_rntm_set_num_threads_only( 1, &rntm );
|
||||
bli_membrk_rntm_set_membrk( &rntm );
|
||||
|
||||
//calculate the size required for n_elem double elements in vector Y.
|
||||
size_t buffer_size = n_elem * sizeof(double);
|
||||
|
||||
#ifdef BLIS_ENABLE_MEM_TRACING
|
||||
printf( "bli_dgemv_unf_var2(): get mem pool block\n" );
|
||||
#endif
|
||||
|
||||
/*acquire a Buffer(n_elem*size(double)) from the memory broker
|
||||
and save the associated mem_t entry to mem_bufY.*/
|
||||
bli_membrk_acquire_m(&rntm,
|
||||
buffer_size,
|
||||
BLIS_BUFFER_FOR_B_PANEL,
|
||||
&mem_bufY);
|
||||
|
||||
/*Continue packing Y if buffer memory is allocated*/
|
||||
if ((bli_mem_is_alloc( &mem_bufY )))
|
||||
{
|
||||
y_buf = bli_mem_buffer(&mem_bufY);
|
||||
|
||||
//pack Y vector with non-unit stride to a temp buffer y_buf with unit stride
|
||||
for(dim_t y_index = 0 ; y_index < n_elem ; y_index++)
|
||||
{
|
||||
*(y_buf + y_index) = *(y + (y_index * incy)) ;
|
||||
}
|
||||
// stride of vector y_buf =1
|
||||
buf_incy = 1;
|
||||
}
|
||||
}
|
||||
/* Fusing factor. */
|
||||
b_fuse = 4;
|
||||
|
||||
for ( i = 0; i < n_iter; i += f )
|
||||
{
|
||||
f = bli_determine_blocksize_dim_f( i, n_iter, b_fuse );
|
||||
f = bli_determine_blocksize_dim_f( i, n_iter, BLIS_DGEMV_VAR2_FUSE );
|
||||
|
||||
A1 = a + (0 )*rs_at + (i )*cs_at;
|
||||
x1 = x + (i )*incx;
|
||||
y1 = y + (0 )*incy;
|
||||
|
||||
/* y = y + alpha * A1 * x1; */
|
||||
bli_daxpyf_zen_int_16x4
|
||||
@@ -208,10 +259,25 @@ void bli_dgemv_unf_var2
|
||||
alpha,
|
||||
A1, rs_at, cs_at,
|
||||
x1, incx,
|
||||
y1, incy,
|
||||
y_buf, buf_incy,
|
||||
NULL
|
||||
);
|
||||
}
|
||||
if ((incy > 1) && bli_mem_is_alloc( &mem_bufY ))
|
||||
{
|
||||
//store the result from unit strided y_buf to non-unit strided Y
|
||||
for(dim_t y_index = 0 ; y_index < n_elem ; y_index++)
|
||||
{
|
||||
*(y + (y_index * incy)) = *(y_buf + y_index) ;
|
||||
}
|
||||
|
||||
#ifdef BLIS_ENABLE_MEM_TRACING
|
||||
printf( "bli_dgemv_unf_var2(): releasing mem pool block\n" );
|
||||
#endif
|
||||
// Return the buffer to pool
|
||||
bli_membrk_release(&rntm , &mem_bufY);
|
||||
}
|
||||
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
|
||||
}
|
||||
|
||||
@@ -250,19 +316,19 @@ void bli_sgemv_unf_var2
|
||||
/* y = beta * y; */
|
||||
/* beta=0 case is handled by scalv internally */
|
||||
|
||||
bli_sscalv_zen_int10
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n_elem,
|
||||
beta,
|
||||
y, incy,
|
||||
NULL
|
||||
);
|
||||
bli_sscalv_zen_int10
|
||||
(
|
||||
BLIS_NO_CONJUGATE,
|
||||
n_elem,
|
||||
beta,
|
||||
y, incy,
|
||||
NULL
|
||||
);
|
||||
|
||||
if( bli_seq0( *alpha ) )
|
||||
{
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3)
|
||||
return;
|
||||
return;
|
||||
}
|
||||
|
||||
/* Query the context for the kernel function pointer and fusing factor. */
|
||||
@@ -342,7 +408,7 @@ void bli_zgemv_unf_var2
|
||||
if( bli_zeq0( *alpha ) )
|
||||
{
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3);
|
||||
return;
|
||||
return;
|
||||
}
|
||||
|
||||
/* fusing factor */
|
||||
@@ -418,8 +484,8 @@ void bli_cgemv_unf_var2
|
||||
|
||||
if( bli_ceq0( *alpha ) )
|
||||
{
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3)
|
||||
return;
|
||||
AOCL_DTL_TRACE_EXIT(AOCL_DTL_LEVEL_TRACE_3)
|
||||
return;
|
||||
}
|
||||
/* fusing factor. */
|
||||
b_fuse = 4;
|
||||
|
||||
Reference in New Issue
Block a user