Updated level-1/-1f [vector intrinsic] kernels.

Details:
- Updated level-1/-1f kernels so that non-unit stride and unaligned
  cases are handled by the reference implementation (rather than
  aborting).
- Added -fomit-frame-pointer to default make_defs.mk for clarksville
  configuration.
- Defined bli_offset_from_alignment() macro.
- Minor edits to old test drivers.
This commit is contained in:
Field G. Van Zee
2013-06-03 16:54:52 -05:00
parent 0288c827d3
commit 22b06cfcd2
15 changed files with 257 additions and 91 deletions

View File

@@ -82,7 +82,7 @@ CPPROCFLAGS := -D_POSIX_C_SOURCE=200112L
CMISCFLAGS := -std=c99 # -fopenmp -pg
CDBGFLAGS := #-g
CWARNFLAGS := -Wall
COPTFLAGS := -O2 #-malign-double
COPTFLAGS := -O2 -fomit-frame-pointer
CVECFLAGS := -msse3 -march=nocona -mfpmath=sse
# Aggregate all of the flags into two groups: one for optimizable code, and

View File

@@ -48,6 +48,10 @@
\
( ( siz_t )(p) % (size) != 0 )
// Evaluates to the remainder of p (cast to siz_t) divided by size. The
// result is zero exactly when p is a multiple of size; otherwise it is the
// distance of p past the previous multiple of size.
// NOTE(review): assumes siz_t (declared elsewhere) is an unsigned integer
// type wide enough to hold a pointer value -- confirm.
#define bli_offset_from_alignment( p, size ) \
\
( ( siz_t )(p) % (size) )
// datatype

View File

@@ -194,9 +194,34 @@ void bli_dddaxpyv_opt_var1(
v2df_t x1v, x2v, x3v, x4v;
v2df_t y1v, y2v, y3v, y4v;
bool_t use_ref = FALSE;
if ( bli_zero_dim1( n ) ) return;
n_pre = 0;
// If there is anything that would interfere with our use of aligned
// vector loads/stores, call the reference implementation.
if ( incx != 1 || incy != 1 )
{
use_ref = TRUE;
}
else if ( bli_is_unaligned_to( x, 16 ) ||
bli_is_unaligned_to( y, 16 ) )
{
use_ref = TRUE;
if ( bli_is_unaligned_to( x, 16 ) &&
bli_is_unaligned_to( y, 16 ) )
{
use_ref = FALSE;
n_pre = 1;
}
}
// Call the reference implementation if needed.
if ( use_ref == TRUE )
{
bli_dddaxpyv_unb_var1( conjx,
n,
@@ -206,13 +231,6 @@ void bli_dddaxpyv_opt_var1(
return;
}
n_pre = 0;
if ( ( unsigned long ) x % 16 != 0 )
{
if ( ( unsigned long ) y % 16 == 0 ) bli_abort();
n_pre = 1;
}
n_run = ( n - n_pre ) / ( n_elem_per_reg * n_iter_unroll );
n_left = ( n - n_pre ) % ( n_elem_per_reg * n_iter_unroll );

View File

@@ -211,25 +211,49 @@ void bli_ddddotv_opt_var1(
double rho1;
double x1c, y1c;
v2df_t rho1v;
v2df_t x1v, y1v;
v2df_t rho1v;
v2df_t x1v, y1v;
bool_t use_ref = FALSE;
// If the vector lengths are zero, set rho to zero and return.
if ( bli_zero_dim1( n ) )
{
PASTEMAC(d,set0s)( *rho_cast );
return;
}
if ( incx != 1 ||
incy != 1 ) bli_abort();
n_pre = 0;
if ( ( unsigned long ) y % 16 != 0 )
{
if ( ( unsigned long ) x % 16 == 0 )
bli_abort();
n_pre = 1;
// If there is anything that would interfere with our use of aligned
// vector loads/stores, call the reference implementation.
if ( incx != 1 || incy != 1 )
{
use_ref = TRUE;
}
else if ( bli_is_unaligned_to( x, 16 ) ||
bli_is_unaligned_to( y, 16 ) )
{
use_ref = TRUE;
if ( bli_is_unaligned_to( x, 16 ) &&
bli_is_unaligned_to( y, 16 ) )
{
use_ref = FALSE;
n_pre = 1;
}
}
// Call the reference implementation if needed.
if ( use_ref == TRUE )
{
bli_ddddotv_unb_var1( conjx,
conjy,
n,
x, incx,
y, incy,
rho );
return;
}
n_run = ( n - n_pre ) / 2;

View File

@@ -166,7 +166,7 @@ void bli_dddaxpy2v_opt_var1(
void* beta,
void* x, inc_t incx,
void* y, inc_t incy,
void* z, inc_t incz
void* z, inc_t incz
)
{
double* restrict alpha_cast = alpha;
@@ -192,20 +192,48 @@ void bli_dddaxpy2v_opt_var1(
v2df_t x1v, y1v, z1v;
v2df_t x2v, y2v, z2v;
bool_t use_ref = FALSE;
if ( bli_zero_dim1( n ) ) return;
if ( incx != 1 ||
incy != 1 ||
incz != 1 ) bli_abort();
n_pre = 0;
if ( ( unsigned long ) x % 16 != 0 )
{
if ( ( unsigned long ) y % 16 == 0 ||
( unsigned long ) z % 16 == 0 ) bli_abort();
n_pre = 1;
// If there is anything that would interfere with our use of aligned
// vector loads/stores, call the reference implementation.
if ( incx != 1 || incy != 1 || incz != 1 )
{
use_ref = TRUE;
}
else if ( bli_is_unaligned_to( x, 16 ) ||
bli_is_unaligned_to( y, 16 ) ||
bli_is_unaligned_to( z, 16 ) )
{
use_ref = TRUE;
if ( bli_is_unaligned_to( x, 16 ) &&
bli_is_unaligned_to( y, 16 ) &&
bli_is_unaligned_to( z, 16 ) )
{
use_ref = FALSE;
n_pre = 1;
}
}
// Call the reference implementation if needed.
if ( use_ref == TRUE )
{
bli_dddaxpy2v_unb_var1( conjx,
conjy,
n,
alpha,
beta,
x, incx,
y, incy,
z, incz );
return;
}
n_run = ( n - n_pre ) / ( n_elem_per_reg * n_iter_unroll );
n_left = ( n - n_pre ) % ( n_elem_per_reg * n_iter_unroll );

View File

@@ -199,9 +199,40 @@ void bli_dddaxpyf_opt_var1(
v2df_t a10v, a11v, a12v, a13v, y1v;
v2df_t chi0v, chi1v, chi2v, chi3v;
bool_t use_ref = FALSE;
if ( bli_zero_dim2( m, b_n ) ) return;
m_pre = 0;
// If there is anything that would interfere with our use of aligned
// vector loads/stores, call the reference implementation.
if ( b_n < PASTEMAC(d,axpyf_fuse_fac) )
{
use_ref = TRUE;
}
else if ( inca != 1 || incx != 1 || incy != 1 )
{
use_ref = TRUE;
}
else if ( bli_is_unaligned_to( a, 16 ) ||
bli_is_unaligned_to( x, 16 ) ||
bli_is_unaligned_to( y, 16 ) )
{
use_ref = TRUE;
if ( bli_is_unaligned_to( a, 16 ) &&
bli_is_unaligned_to( x, 16 ) &&
bli_is_unaligned_to( y, 16 ) )
{
use_ref = FALSE;
m_pre = 1;
}
}
// Call the reference implementation if needed.
if ( use_ref == TRUE )
{
PASTEMAC3(d,d,d,axpyf_unb_var1)( conja,
conjx,
@@ -214,18 +245,6 @@ void bli_dddaxpyf_opt_var1(
return;
}
if ( inca != 1 ||
incx != 1 ||
incy != 1 ) bli_abort();
m_pre = 0;
if ( ( unsigned long ) a % 16 != 0 )
{
if ( ( unsigned long ) x % 16 == 0 ||
( unsigned long ) y % 16 == 0 ) bli_abort();
m_pre = 1;
}
m_run = ( m - m_pre ) / ( n_elem_per_reg * n_iter_unroll );
m_left = ( m - m_pre ) % ( n_elem_per_reg * n_iter_unroll );

View File

@@ -128,23 +128,56 @@ void bli_ddddotaxpyv_opt_var1(
dim_t i;
inc_t stepx, stepy, stepz;
v2df_t alphav, rhov;
v2df_t x1v, y1v, z1v;
v2df_t alphav, rhov;
v2df_t x1v, y1v, z1v;
bool_t use_ref = FALSE;
// If the vector lengths are zero, set rho to zero and return.
if ( bli_zero_dim1( n ) )
{
PASTEMAC(d,set0s)( *rho_cast );
return;
}
n_pre = 0;
if ( ( unsigned long ) x % 16 != 0 )
{
if ( ( unsigned long ) y % 16 == 0 ||
( unsigned long ) z % 16 == 0 ) bli_abort();
n_pre = 0;
n_pre = 1;
// If there is anything that would interfere with our use of aligned
// vector loads/stores, call the reference implementation.
if ( incx != 1 || incy != 1 || incz != 1 )
{
use_ref = TRUE;
}
else if ( bli_is_unaligned_to( x, 16 ) ||
bli_is_unaligned_to( y, 16 ) ||
bli_is_unaligned_to( z, 16 ) )
{
use_ref = TRUE;
if ( bli_is_unaligned_to( x, 16 ) &&
bli_is_unaligned_to( y, 16 ) &&
bli_is_unaligned_to( z, 16 ) )
{
use_ref = FALSE;
n_pre = 1;
}
}
// Call the reference implementation if needed.
if ( use_ref == TRUE )
{
bli_ddddotaxpyv_unb_var1( conjxt,
conjx,
conjy,
n,
alpha,
x, incx,
y, incy,
rho,
z, incz );
return;
}
n_run = ( n - n_pre ) / ( 2 * 1 );
n_left = ( n - n_pre ) % ( 2 * 1 );

View File

@@ -164,8 +164,12 @@ void bli_ddddotxaxpyf_opt_var1(
v2df_t w2v, z2v;
v2df_t psi0v, psi1v, betav, alphav;
bool_t use_ref = FALSE;
if ( bli_zero_dim1( b_n ) ) return;
// If the vector lengths are zero, scale y by beta and return.
if ( bli_zero_dim1( m ) )
{
PASTEMAC2(d,d,scalv)( BLIS_NO_CONJUGATE,
@@ -175,7 +179,38 @@ void bli_ddddotxaxpyf_opt_var1(
return;
}
m_pre = 0;
// If there is anything that would interfere with our use of aligned
// vector loads/stores, call the reference implementation.
if ( b_n < PASTEMAC(d,dotxaxpyf_fuse_fac) )
{
use_ref = TRUE;
}
else if ( inca != 1 || incw != 1 || incx != 1 || incy != 1 || incz != 1 )
{
use_ref = TRUE;
}
else if ( bli_is_unaligned_to( a, 16 ) ||
          bli_is_unaligned_to( w, 16 ) ||
          bli_is_unaligned_to( x, 16 ) ||
          bli_is_unaligned_to( y, 16 ) ||
          bli_is_unaligned_to( z, 16 ) )
{
	// At least one of the five operands (a, w, x, y, z) is not aligned to
	// 16 bytes, so default to the reference implementation. Every operand
	// must be tested here: if w were left out, an unaligned w combined
	// with aligned a, x, y and z would fall through to the vector code
	// path with use_ref still FALSE.
	use_ref = TRUE;

	// Exception: when all five operands are unaligned, keep the optimized
	// path and set m_pre to 1, which is subtracted from m when the
	// vectorized iteration counts are computed below.
	// NOTE(review): this presumably relies on each operand being offset by
	// exactly one double (8 bytes) from a 16-byte boundary -- confirm.
	if ( bli_is_unaligned_to( a, 16 ) &&
	     bli_is_unaligned_to( w, 16 ) &&
	     bli_is_unaligned_to( x, 16 ) &&
	     bli_is_unaligned_to( y, 16 ) &&
	     bli_is_unaligned_to( z, 16 ) )
	{
		use_ref = FALSE;
		m_pre   = 1;
	}
}
if ( use_ref == TRUE )
{
PASTEMAC3(d,d,d,dotxaxpyf_unb_var1)( conjat,
conja,
@@ -194,24 +229,6 @@ void bli_ddddotxaxpyf_opt_var1(
}
if ( inca != 1 ||
incw != 1 ||
incx != 1 ||
incy != 1 ||
incz != 1 ) bli_abort();
m_pre = 0;
if ( ( unsigned long ) a % 16 != 0 )
{
if ( ( unsigned long ) w % 16 == 0 ||
( unsigned long ) x % 16 == 0 ||
( unsigned long ) y % 16 == 0 ||
( unsigned long ) z % 16 == 0 )
bli_abort();
m_pre = 1;
}
m_run = ( m - m_pre ) / ( n_elem_per_reg * n_iter_unroll );
m_left = ( m - m_pre ) % ( n_elem_per_reg * n_iter_unroll );

View File

@@ -220,18 +220,50 @@ void bli_ddddotxf_opt_var1(
v2df_t rho0v, rho1v, rho2v, rho3v;
v2df_t x0v, x1v, x2v, x3v, y0v, betav, alphav;
bool_t use_ref = FALSE;
if ( bli_zero_dim1( b_m ) ) return;
// If the vector lengths are zero, scale r by beta and return.
if ( bli_zero_dim1( n ) )
{
PASTEMAC(d,scals)( *beta_cast, *(r_cast ) );
PASTEMAC(d,scals)( *beta_cast, *(r_cast+1) );
PASTEMAC(d,scals)( *beta_cast, *(r_cast+2) );
PASTEMAC(d,scals)( *beta_cast, *(r_cast+3) );
PASTEMAC2(d,d,scalv)( BLIS_NO_CONJUGATE,
b_m,
beta_cast,
r_cast, incr );
return;
}
n_pre = 0;
// If there is anything that would interfere with our use of aligned
// vector loads/stores, call the reference implementation.
if ( b_m < PASTEMAC(d,dotxf_fuse_fac) )
{
use_ref = TRUE;
}
else if ( incx != 1 || incy != 1 || incr != 1 )
{
use_ref = TRUE;
}
else if ( bli_is_unaligned_to( x, 16 ) ||
bli_is_unaligned_to( y, 16 ) ||
bli_is_unaligned_to( r, 16 ) )
{
use_ref = TRUE;
if ( bli_is_unaligned_to( x, 16 ) &&
bli_is_unaligned_to( y, 16 ) &&
bli_is_aligned_to( r, 16 ) ) // Note: r is not affected by x and y being unaligned.
{
use_ref = FALSE;
n_pre = 1;
}
}
// Call the reference implementation if needed.
if ( use_ref == TRUE )
{
PASTEMAC3(d,d,d,dotxf_unb_var1)( conjx,
conjy,
@@ -246,18 +278,6 @@ void bli_ddddotxf_opt_var1(
}
if ( incx != 1 ||
incy != 1 ) bli_abort();
n_pre = 0;
if ( ( unsigned long ) y % 16 != 0 )
{
if ( ( unsigned long ) x % 16 == 0 )
bli_abort();
n_pre = 1;
}
n_run = ( n - n_pre ) / ( n_elem_per_reg * n_iter_unroll );
n_left = ( n - n_pre ) % ( n_elem_per_reg * n_iter_unroll );

View File

@@ -134,7 +134,7 @@ BLAS_LIB_PATH := $(HOME)/flame/lib
OPENBLAS_LIB := $(BLAS_LIB_PATH)/libopenblas.a
ATLAS_LIB := $(BLAS_LIB_PATH)/libf77blas.a \
$(BLAS_LIB_PATH)/libatlas.a
#MKL_LIB := -L/opt/intel/mkl/10.2.2.025/lib/em64t/ \
MKL_LIB := -L$(HOME)/intel/mkl/lib/intel64/ \
-lmkl_sequential -lmkl_core -lmkl_intel_lp64

View File

@@ -1,11 +1,12 @@
#!/bin/bash
exec_root="test"
out_root="output_sqaure"
out_root="output"
#out_root="output_square"
# Operations to test.
#l2_ops="gemv ger hemv her her2 trmv trsv"
l3_ops="gemm" #"gemm hemm herk her2k trmm trsm"
l2_ops="gemv ger hemv her her2 trmv trsv"
l3_ops="gemm hemm herk her2k trmm trsm"
test_ops="${l2_ops} ${l3_ops}"
# Implementations to test

View File

@@ -89,7 +89,7 @@ int main( int argc, char** argv )
m_input = -1;
n_input = -1;
k_input = 200;
k_input = -1;
//k_input = 200;
#else
p_begin = 16;

View File

@@ -86,7 +86,8 @@ int main( int argc, char** argv )
p_inc = 40;
m_input = -1;
k_input = 200;
k_input = -1;
//k_input = 200;
#else
p_begin = 16;
p_end = 16;

View File

@@ -86,7 +86,8 @@ int main( int argc, char** argv )
p_inc = 40;
m_input = -1;
k_input = 200;
k_input = -1;
//k_input = 200;
#else
p_begin = 16;
p_end = 16;

View File

@@ -3,9 +3,9 @@ c #rg # Matrix storage scheme(s) to test ('c' = col-major; 'r' = row-major; '
c #rji # Vector storage scheme(s) to test ('c' = colvec/unit; 'r' = rowvec/unit; 'j' = colvec/non-unit; 'i' = rowvec/non-unit)
1 # Test all combinations of storage schemes?
32 # General stride spacing (for cases when testing general stride)
sdcz # Datatype(s) to test
d #sdcz # Datatype(s) to test
100 # Problem size: first to test
400 # Problem size: maximum to test
500 # Problem size: maximum to test
100 # Problem size: increment between experiments
1 # Error-checking level (0 = disable error checking; 1 = full error checking)
i # Reaction to test failure ('i' = ignore; 's' = sleep() and continue; 'a' = abort)