mirror of
https://github.com/amd/blis.git
synced 2026-04-19 23:28:52 +00:00
Updated level-1/-1f [vector intrinsic] kernels.
Details: - Updated level-1/-1f kernels so that non-unit stride and unaligned cases are handled by the reference implementation (rather than aborting). - Added -fomit-frame-pointer to the default make_defs.mk for the clarksville configuration. - Defined the bli_offset_from_alignment() macro. - Minor edits to old test drivers.
This commit is contained in:
@@ -82,7 +82,7 @@ CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
|
||||
CMISCFLAGS := -std=c99 # -fopenmp -pg
|
||||
CDBGFLAGS := #-g
|
||||
CWARNFLAGS := -Wall
|
||||
COPTFLAGS := -O2 #-malign-double
|
||||
COPTFLAGS := -O2 -fomit-frame-pointer
|
||||
CVECFLAGS := -msse3 -march=nocona -mfpmath=sse
|
||||
|
||||
# Aggregate all of the flags into two groups: one for optimizable code, and
|
||||
|
||||
@@ -48,6 +48,10 @@
|
||||
\
|
||||
( ( siz_t )(p) % (size) != 0 )
|
||||
|
||||
#define bli_offset_from_alignment( p, size ) \
|
||||
\
|
||||
( ( siz_t )(p) % (size) )
|
||||
|
||||
|
||||
// datatype
|
||||
|
||||
|
||||
@@ -194,9 +194,34 @@ void bli_dddaxpyv_opt_var1(
|
||||
v2df_t x1v, x2v, x3v, x4v;
|
||||
v2df_t y1v, y2v, y3v, y4v;
|
||||
|
||||
bool_t use_ref = FALSE;
|
||||
|
||||
|
||||
if ( bli_zero_dim1( n ) ) return;
|
||||
|
||||
n_pre = 0;
|
||||
|
||||
// If there is anything that would interfere with our use of aligned
|
||||
// vector loads/stores, call the reference implementation.
|
||||
if ( incx != 1 || incy != 1 )
|
||||
{
|
||||
use_ref = TRUE;
|
||||
}
|
||||
else if ( bli_is_unaligned_to( x, 16 ) ||
|
||||
bli_is_unaligned_to( y, 16 ) )
|
||||
{
|
||||
use_ref = TRUE;
|
||||
|
||||
if ( bli_is_unaligned_to( x, 16 ) &&
|
||||
bli_is_unaligned_to( y, 16 ) )
|
||||
{
|
||||
use_ref = FALSE;
|
||||
n_pre = 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Call the reference implementation if needed.
|
||||
if ( use_ref == TRUE )
|
||||
{
|
||||
bli_dddaxpyv_unb_var1( conjx,
|
||||
n,
|
||||
@@ -206,13 +231,6 @@ void bli_dddaxpyv_opt_var1(
|
||||
return;
|
||||
}
|
||||
|
||||
n_pre = 0;
|
||||
if ( ( unsigned long ) x % 16 != 0 )
|
||||
{
|
||||
if ( ( unsigned long ) y % 16 == 0 ) bli_abort();
|
||||
|
||||
n_pre = 1;
|
||||
}
|
||||
|
||||
n_run = ( n - n_pre ) / ( n_elem_per_reg * n_iter_unroll );
|
||||
n_left = ( n - n_pre ) % ( n_elem_per_reg * n_iter_unroll );
|
||||
|
||||
@@ -211,25 +211,49 @@ void bli_ddddotv_opt_var1(
|
||||
double rho1;
|
||||
double x1c, y1c;
|
||||
|
||||
v2df_t rho1v;
|
||||
v2df_t x1v, y1v;
|
||||
v2df_t rho1v;
|
||||
v2df_t x1v, y1v;
|
||||
|
||||
bool_t use_ref = FALSE;
|
||||
|
||||
// If the vector lengths are zero, set rho to zero and return.
|
||||
if ( bli_zero_dim1( n ) )
|
||||
{
|
||||
PASTEMAC(d,set0s)( *rho_cast );
|
||||
return;
|
||||
}
|
||||
|
||||
if ( incx != 1 ||
|
||||
incy != 1 ) bli_abort();
|
||||
|
||||
n_pre = 0;
|
||||
if ( ( unsigned long ) y % 16 != 0 )
|
||||
{
|
||||
if ( ( unsigned long ) x % 16 == 0 )
|
||||
bli_abort();
|
||||
|
||||
n_pre = 1;
|
||||
// If there is anything that would interfere with our use of aligned
|
||||
// vector loads/stores, call the reference implementation.
|
||||
if ( incx != 1 || incy != 1 )
|
||||
{
|
||||
use_ref = TRUE;
|
||||
}
|
||||
else if ( bli_is_unaligned_to( x, 16 ) ||
|
||||
bli_is_unaligned_to( y, 16 ) )
|
||||
{
|
||||
use_ref = TRUE;
|
||||
|
||||
if ( bli_is_unaligned_to( x, 16 ) &&
|
||||
bli_is_unaligned_to( y, 16 ) )
|
||||
{
|
||||
use_ref = FALSE;
|
||||
n_pre = 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Call the reference implementation if needed.
|
||||
if ( use_ref == TRUE )
|
||||
{
|
||||
bli_ddddotv_unb_var1( conjx,
|
||||
conjy,
|
||||
n,
|
||||
x, incx,
|
||||
y, incy,
|
||||
rho );
|
||||
return;
|
||||
}
|
||||
|
||||
n_run = ( n - n_pre ) / 2;
|
||||
|
||||
@@ -166,7 +166,7 @@ void bli_dddaxpy2v_opt_var1(
|
||||
void* beta,
|
||||
void* x, inc_t incx,
|
||||
void* y, inc_t incy,
|
||||
void* z, inc_t incz
|
||||
void* z, inc_t incz
|
||||
)
|
||||
{
|
||||
double* restrict alpha_cast = alpha;
|
||||
@@ -192,20 +192,48 @@ void bli_dddaxpy2v_opt_var1(
|
||||
v2df_t x1v, y1v, z1v;
|
||||
v2df_t x2v, y2v, z2v;
|
||||
|
||||
bool_t use_ref = FALSE;
|
||||
|
||||
|
||||
if ( bli_zero_dim1( n ) ) return;
|
||||
|
||||
if ( incx != 1 ||
|
||||
incy != 1 ||
|
||||
incz != 1 ) bli_abort();
|
||||
|
||||
n_pre = 0;
|
||||
if ( ( unsigned long ) x % 16 != 0 )
|
||||
{
|
||||
if ( ( unsigned long ) y % 16 == 0 ||
|
||||
( unsigned long ) z % 16 == 0 ) bli_abort();
|
||||
|
||||
n_pre = 1;
|
||||
// If there is anything that would interfere with our use of aligned
|
||||
// vector loads/stores, call the reference implementation.
|
||||
if ( incx != 1 || incy != 1 || incz != 1 )
|
||||
{
|
||||
use_ref = TRUE;
|
||||
}
|
||||
else if ( bli_is_unaligned_to( x, 16 ) ||
|
||||
bli_is_unaligned_to( y, 16 ) ||
|
||||
bli_is_unaligned_to( z, 16 ) )
|
||||
{
|
||||
use_ref = TRUE;
|
||||
|
||||
if ( bli_is_unaligned_to( x, 16 ) &&
|
||||
bli_is_unaligned_to( y, 16 ) &&
|
||||
bli_is_unaligned_to( z, 16 ) )
|
||||
{
|
||||
use_ref = FALSE;
|
||||
n_pre = 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Call the reference implementation if needed.
|
||||
if ( use_ref == TRUE )
|
||||
{
|
||||
bli_dddaxpy2v_unb_var1( conjx,
|
||||
conjy,
|
||||
n,
|
||||
alpha,
|
||||
beta,
|
||||
x, incx,
|
||||
y, incy,
|
||||
z, incz );
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
n_run = ( n - n_pre ) / ( n_elem_per_reg * n_iter_unroll );
|
||||
n_left = ( n - n_pre ) % ( n_elem_per_reg * n_iter_unroll );
|
||||
|
||||
@@ -199,9 +199,40 @@ void bli_dddaxpyf_opt_var1(
|
||||
v2df_t a10v, a11v, a12v, a13v, y1v;
|
||||
v2df_t chi0v, chi1v, chi2v, chi3v;
|
||||
|
||||
bool_t use_ref = FALSE;
|
||||
|
||||
|
||||
if ( bli_zero_dim2( m, b_n ) ) return;
|
||||
|
||||
m_pre = 0;
|
||||
|
||||
// If there is anything that would interfere with our use of aligned
|
||||
// vector loads/stores, call the reference implementation.
|
||||
if ( b_n < PASTEMAC(d,axpyf_fuse_fac) )
|
||||
{
|
||||
use_ref = TRUE;
|
||||
}
|
||||
else if ( inca != 1 || incx != 1 || incy != 1 )
|
||||
{
|
||||
use_ref = TRUE;
|
||||
}
|
||||
else if ( bli_is_unaligned_to( a, 16 ) ||
|
||||
bli_is_unaligned_to( x, 16 ) ||
|
||||
bli_is_unaligned_to( y, 16 ) )
|
||||
{
|
||||
use_ref = TRUE;
|
||||
|
||||
if ( bli_is_unaligned_to( a, 16 ) &&
|
||||
bli_is_unaligned_to( x, 16 ) &&
|
||||
bli_is_unaligned_to( y, 16 ) )
|
||||
{
|
||||
use_ref = FALSE;
|
||||
m_pre = 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Call the reference implementation if needed.
|
||||
if ( use_ref == TRUE )
|
||||
{
|
||||
PASTEMAC3(d,d,d,axpyf_unb_var1)( conja,
|
||||
conjx,
|
||||
@@ -214,18 +245,6 @@ void bli_dddaxpyf_opt_var1(
|
||||
return;
|
||||
}
|
||||
|
||||
if ( inca != 1 ||
|
||||
incx != 1 ||
|
||||
incy != 1 ) bli_abort();
|
||||
|
||||
m_pre = 0;
|
||||
if ( ( unsigned long ) a % 16 != 0 )
|
||||
{
|
||||
if ( ( unsigned long ) x % 16 == 0 ||
|
||||
( unsigned long ) y % 16 == 0 ) bli_abort();
|
||||
|
||||
m_pre = 1;
|
||||
}
|
||||
|
||||
m_run = ( m - m_pre ) / ( n_elem_per_reg * n_iter_unroll );
|
||||
m_left = ( m - m_pre ) % ( n_elem_per_reg * n_iter_unroll );
|
||||
|
||||
@@ -128,23 +128,56 @@ void bli_ddddotaxpyv_opt_var1(
|
||||
dim_t i;
|
||||
inc_t stepx, stepy, stepz;
|
||||
|
||||
v2df_t alphav, rhov;
|
||||
v2df_t x1v, y1v, z1v;
|
||||
v2df_t alphav, rhov;
|
||||
v2df_t x1v, y1v, z1v;
|
||||
|
||||
bool_t use_ref = FALSE;
|
||||
|
||||
// If the vector lengths are zero, set rho to zero and return.
|
||||
if ( bli_zero_dim1( n ) )
|
||||
{
|
||||
PASTEMAC(d,set0s)( *rho_cast );
|
||||
return;
|
||||
}
|
||||
|
||||
n_pre = 0;
|
||||
if ( ( unsigned long ) x % 16 != 0 )
|
||||
{
|
||||
if ( ( unsigned long ) y % 16 == 0 ||
|
||||
( unsigned long ) z % 16 == 0 ) bli_abort();
|
||||
n_pre = 0;
|
||||
|
||||
n_pre = 1;
|
||||
// If there is anything that would interfere with our use of aligned
|
||||
// vector loads/stores, call the reference implementation.
|
||||
if ( incx != 1 || incy != 1 || incz != 1 )
|
||||
{
|
||||
use_ref = TRUE;
|
||||
}
|
||||
else if ( bli_is_unaligned_to( x, 16 ) ||
|
||||
bli_is_unaligned_to( y, 16 ) ||
|
||||
bli_is_unaligned_to( z, 16 ) )
|
||||
{
|
||||
use_ref = TRUE;
|
||||
|
||||
if ( bli_is_unaligned_to( x, 16 ) &&
|
||||
bli_is_unaligned_to( y, 16 ) &&
|
||||
bli_is_unaligned_to( z, 16 ) )
|
||||
{
|
||||
use_ref = FALSE;
|
||||
n_pre = 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Call the reference implementation if needed.
|
||||
if ( use_ref == TRUE )
|
||||
{
|
||||
bli_ddddotaxpyv_unb_var1( conjxt,
|
||||
conjx,
|
||||
conjy,
|
||||
n,
|
||||
alpha,
|
||||
x, incx,
|
||||
y, incy,
|
||||
rho,
|
||||
z, incz );
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
n_run = ( n - n_pre ) / ( 2 * 1 );
|
||||
n_left = ( n - n_pre ) % ( 2 * 1 );
|
||||
|
||||
@@ -164,8 +164,12 @@ void bli_ddddotxaxpyf_opt_var1(
|
||||
v2df_t w2v, z2v;
|
||||
v2df_t psi0v, psi1v, betav, alphav;
|
||||
|
||||
bool_t use_ref = FALSE;
|
||||
|
||||
|
||||
if ( bli_zero_dim1( b_n ) ) return;
|
||||
|
||||
// If the vector lengths are zero, scale y by beta and return.
|
||||
if ( bli_zero_dim1( m ) )
|
||||
{
|
||||
PASTEMAC2(d,d,scalv)( BLIS_NO_CONJUGATE,
|
||||
@@ -175,7 +179,38 @@ void bli_ddddotxaxpyf_opt_var1(
|
||||
return;
|
||||
}
|
||||
|
||||
m_pre = 0;
|
||||
|
||||
// If there is anything that would interfere with our use of aligned
|
||||
// vector loads/stores, call the reference implementation.
|
||||
if ( b_n < PASTEMAC(d,dotxaxpyf_fuse_fac) )
|
||||
{
|
||||
use_ref = TRUE;
|
||||
}
|
||||
else if ( inca != 1 || incw != 1 || incx != 1 || incy != 1 || incz != 1 )
|
||||
{
|
||||
use_ref = TRUE;
|
||||
}
|
||||
else if ( bli_is_unaligned_to( a, 16 ) ||
|
||||
bli_is_unaligned_to( x, 16 ) ||
|
||||
bli_is_unaligned_to( x, 16 ) ||
|
||||
bli_is_unaligned_to( y, 16 ) ||
|
||||
bli_is_unaligned_to( z, 16 ) )
|
||||
{
|
||||
use_ref = TRUE;
|
||||
|
||||
if ( bli_is_unaligned_to( a, 16 ) &&
|
||||
bli_is_unaligned_to( w, 16 ) &&
|
||||
bli_is_unaligned_to( x, 16 ) &&
|
||||
bli_is_unaligned_to( y, 16 ) &&
|
||||
bli_is_unaligned_to( z, 16 ) )
|
||||
{
|
||||
use_ref = FALSE;
|
||||
m_pre = 1;
|
||||
}
|
||||
}
|
||||
|
||||
if ( use_ref == TRUE )
|
||||
{
|
||||
PASTEMAC3(d,d,d,dotxaxpyf_unb_var1)( conjat,
|
||||
conja,
|
||||
@@ -194,24 +229,6 @@ void bli_ddddotxaxpyf_opt_var1(
|
||||
}
|
||||
|
||||
|
||||
if ( inca != 1 ||
|
||||
incw != 1 ||
|
||||
incx != 1 ||
|
||||
incy != 1 ||
|
||||
incz != 1 ) bli_abort();
|
||||
|
||||
m_pre = 0;
|
||||
if ( ( unsigned long ) a % 16 != 0 )
|
||||
{
|
||||
if ( ( unsigned long ) w % 16 == 0 ||
|
||||
( unsigned long ) x % 16 == 0 ||
|
||||
( unsigned long ) y % 16 == 0 ||
|
||||
( unsigned long ) z % 16 == 0 )
|
||||
bli_abort();
|
||||
|
||||
m_pre = 1;
|
||||
}
|
||||
|
||||
m_run = ( m - m_pre ) / ( n_elem_per_reg * n_iter_unroll );
|
||||
m_left = ( m - m_pre ) % ( n_elem_per_reg * n_iter_unroll );
|
||||
|
||||
|
||||
@@ -220,18 +220,50 @@ void bli_ddddotxf_opt_var1(
|
||||
v2df_t rho0v, rho1v, rho2v, rho3v;
|
||||
v2df_t x0v, x1v, x2v, x3v, y0v, betav, alphav;
|
||||
|
||||
bool_t use_ref = FALSE;
|
||||
|
||||
|
||||
if ( bli_zero_dim1( b_m ) ) return;
|
||||
|
||||
// If the vector lengths are zero, scale r by beta and return.
|
||||
if ( bli_zero_dim1( n ) )
|
||||
{
|
||||
PASTEMAC(d,scals)( *beta_cast, *(r_cast ) );
|
||||
PASTEMAC(d,scals)( *beta_cast, *(r_cast+1) );
|
||||
PASTEMAC(d,scals)( *beta_cast, *(r_cast+2) );
|
||||
PASTEMAC(d,scals)( *beta_cast, *(r_cast+3) );
|
||||
PASTEMAC2(d,d,scalv)( BLIS_NO_CONJUGATE,
|
||||
b_m,
|
||||
beta_cast,
|
||||
r_cast, incr );
|
||||
return;
|
||||
}
|
||||
|
||||
n_pre = 0;
|
||||
|
||||
// If there is anything that would interfere with our use of aligned
|
||||
// vector loads/stores, call the reference implementation.
|
||||
if ( b_m < PASTEMAC(d,dotxf_fuse_fac) )
|
||||
{
|
||||
use_ref = TRUE;
|
||||
}
|
||||
else if ( incx != 1 || incy != 1 || incr != 1 )
|
||||
{
|
||||
use_ref = TRUE;
|
||||
}
|
||||
else if ( bli_is_unaligned_to( x, 16 ) ||
|
||||
bli_is_unaligned_to( y, 16 ) ||
|
||||
bli_is_unaligned_to( r, 16 ) )
|
||||
{
|
||||
use_ref = TRUE;
|
||||
|
||||
if ( bli_is_unaligned_to( x, 16 ) &&
|
||||
bli_is_unaligned_to( y, 16 ) &&
|
||||
bli_is_aligned_to( r, 16 ) ) // Note: r is not affected by x and y being unaligned.
|
||||
{
|
||||
use_ref = FALSE;
|
||||
n_pre = 1;
|
||||
}
|
||||
}
|
||||
|
||||
// Call the reference implementation if needed.
|
||||
if ( use_ref == TRUE )
|
||||
{
|
||||
PASTEMAC3(d,d,d,dotxf_unb_var1)( conjx,
|
||||
conjy,
|
||||
@@ -246,18 +278,6 @@ void bli_ddddotxf_opt_var1(
|
||||
}
|
||||
|
||||
|
||||
if ( incx != 1 ||
|
||||
incy != 1 ) bli_abort();
|
||||
|
||||
n_pre = 0;
|
||||
if ( ( unsigned long ) y % 16 != 0 )
|
||||
{
|
||||
if ( ( unsigned long ) x % 16 == 0 )
|
||||
bli_abort();
|
||||
|
||||
n_pre = 1;
|
||||
}
|
||||
|
||||
n_run = ( n - n_pre ) / ( n_elem_per_reg * n_iter_unroll );
|
||||
n_left = ( n - n_pre ) % ( n_elem_per_reg * n_iter_unroll );
|
||||
|
||||
|
||||
@@ -134,7 +134,7 @@ BLAS_LIB_PATH := $(HOME)/flame/lib
|
||||
OPENBLAS_LIB := $(BLAS_LIB_PATH)/libopenblas.a
|
||||
ATLAS_LIB := $(BLAS_LIB_PATH)/libf77blas.a \
|
||||
$(BLAS_LIB_PATH)/libatlas.a
|
||||
#MKL_LIB := -L/opt/intel/mkl/10.2.2.025/lib/em64t/ \
|
||||
MKL_LIB := -L$(HOME)/intel/mkl/lib/intel64/ \
|
||||
-lmkl_sequential -lmkl_core -lmkl_intel_lp64
|
||||
|
||||
|
||||
|
||||
@@ -1,11 +1,12 @@
|
||||
#!/bin/bash
|
||||
|
||||
exec_root="test"
|
||||
out_root="output_sqaure"
|
||||
out_root="output"
|
||||
#out_root="output_square"
|
||||
|
||||
# Operations to test.
|
||||
#l2_ops="gemv ger hemv her her2 trmv trsv"
|
||||
l3_ops="gemm" #"gemm hemm herk her2k trmm trsm"
|
||||
l2_ops="gemv ger hemv her her2 trmv trsv"
|
||||
l3_ops="gemm hemm herk her2k trmm trsm"
|
||||
test_ops="${l2_ops} ${l3_ops}"
|
||||
|
||||
# Implementations to test
|
||||
|
||||
@@ -89,7 +89,7 @@ int main( int argc, char** argv )
|
||||
|
||||
m_input = -1;
|
||||
n_input = -1;
|
||||
k_input = 200;
|
||||
k_input = -1;
|
||||
//k_input = 200;
|
||||
#else
|
||||
p_begin = 16;
|
||||
|
||||
@@ -86,7 +86,8 @@ int main( int argc, char** argv )
|
||||
p_inc = 40;
|
||||
|
||||
m_input = -1;
|
||||
k_input = 200;
|
||||
k_input = -1;
|
||||
//k_input = 200;
|
||||
#else
|
||||
p_begin = 16;
|
||||
p_end = 16;
|
||||
|
||||
@@ -86,7 +86,8 @@ int main( int argc, char** argv )
|
||||
p_inc = 40;
|
||||
|
||||
m_input = -1;
|
||||
k_input = 200;
|
||||
k_input = -1;
|
||||
//k_input = 200;
|
||||
#else
|
||||
p_begin = 16;
|
||||
p_end = 16;
|
||||
|
||||
@@ -3,9 +3,9 @@ c #rg # Matrix storage scheme(s) to test ('c' = col-major; 'r' = row-major; '
|
||||
c #rji # Vector storage scheme(s) to test ('c' = colvec/unit; 'r' = rowvec/unit; 'j' = colvec/non-unit; 'i' = rowvec/non-unit)
|
||||
1 # Test all combinations of storage schemes?
|
||||
32 # General stride spacing (for cases when testing general stride)
|
||||
sdcz # Datatype(s) to test
|
||||
d #sdcz # Datatype(s) to test
|
||||
100 # Problem size: first to test
|
||||
400 # Problem size: maximum to test
|
||||
500 # Problem size: maximum to test
|
||||
100 # Problem size: increment between experiments
|
||||
1 # Error-checking level (0 = disable error checking; 1 = full error checking)
|
||||
i # Reaction to test failure ('i' = ignore; 's' = sleep() and continue; 'a' = abort)
|
||||
|
||||
Reference in New Issue
Block a user