Updated level-1/-1f [vector intrinsic] kernels.

Details: - Updated level-1/-1f kernels so that non-unit-stride and unaligned cases are handled by the reference implementation (rather than calling bli_abort()). - Added -fomit-frame-pointer to the default make_defs.mk for the clarksville configuration. - Defined the bli_offset_from_alignment() macro. - Minor edits to old test drivers.
2026-04-19 23:28:52 +00:00 · 2013-06-03 16:54:52 -05:00
parent 0288c827d3
commit 22b06cfcd2
15 changed files with 257 additions and 91 deletions
--- a/config/clarksville/make_defs.mk
+++ b/config/clarksville/make_defs.mk
@@ -82,7 +82,7 @@ CPPROCFLAGS  := -D_POSIX_C_SOURCE=200112L
 CMISCFLAGS   := -std=c99 # -fopenmp -pg
 CDBGFLAGS    := #-g
 CWARNFLAGS   := -Wall
-COPTFLAGS    := -O2 #-malign-double
+COPTFLAGS    := -O2 -fomit-frame-pointer
 CVECFLAGS    := -msse3 -march=nocona -mfpmath=sse

 # Aggregate all of the flags into two groups: one for optimizable code, and
--- a/frame/include/bli_param_macro_defs.h
+++ b/frame/include/bli_param_macro_defs.h
@@ -48,6 +48,10 @@
 \
 	( ( siz_t )(p) % (size) != 0 )

+#define bli_offset_from_alignment( p, size ) \
+\
+	( ( siz_t )(p) % (size) )
+

 // datatype

--- a/kernels/x86_64/1/bli_axpyv_opt_var1.c
+++ b/kernels/x86_64/1/bli_axpyv_opt_var1.c
@@ -194,9 +194,34 @@ void bli_dddaxpyv_opt_var1(
 	v2df_t            x1v, x2v, x3v, x4v;
 	v2df_t            y1v, y2v, y3v, y4v;

+	bool_t            use_ref = FALSE;
+
+
 	if ( bli_zero_dim1( n ) ) return;

+	n_pre = 0;
+
+	// If there is anything that would interfere with our use of aligned
+	// vector loads/stores, call the reference implementation.
 	if ( incx != 1 || incy != 1 )
+	{
+		use_ref = TRUE;
+	}
+	else if ( bli_is_unaligned_to( x, 16 ) ||
+	          bli_is_unaligned_to( y, 16 ) )
+	{
+		use_ref = TRUE;
+
+		if ( bli_is_unaligned_to( x, 16 ) &&
+		     bli_is_unaligned_to( y, 16 ) )
+		{
+			use_ref = FALSE;
+			n_pre   = 1;
+		}
+	}
+
+	// Call the reference implementation if needed.
+	if ( use_ref == TRUE )
 	{
 		bli_dddaxpyv_unb_var1( conjx,
 		                       n,
@@ -206,13 +231,6 @@ void bli_dddaxpyv_opt_var1(
 		return;
 	}

-	n_pre = 0;
-	if ( ( unsigned long ) x % 16 != 0 )
-	{
-		if ( ( unsigned long ) y % 16 == 0 ) bli_abort();
-
-		n_pre = 1;
-	}

 	n_run       = ( n - n_pre ) / ( n_elem_per_reg * n_iter_unroll );
 	n_left      = ( n - n_pre ) % ( n_elem_per_reg * n_iter_unroll );
--- a/kernels/x86_64/1/bli_dotv_opt_var1.c
+++ b/kernels/x86_64/1/bli_dotv_opt_var1.c
@@ -211,25 +211,49 @@ void bli_ddddotv_opt_var1(
 	double            rho1;
 	double            x1c, y1c;

-	v2df_t             rho1v;
-	v2df_t             x1v, y1v;
+	v2df_t            rho1v;
+	v2df_t            x1v, y1v;

+	bool_t            use_ref = FALSE;
+
+	// If the vector lengths are zero, set rho to zero and return.
 	if ( bli_zero_dim1( n ) ) 
 	{ 
 		PASTEMAC(d,set0s)( *rho_cast ); 
 		return; 
 	} 

-	if ( incx != 1 ||
-	     incy != 1 ) bli_abort();
-
 	n_pre = 0;
-	if ( ( unsigned long ) y % 16 != 0 )
-	{
-		if ( ( unsigned long ) x % 16 == 0 )
-			bli_abort();

-		n_pre = 1;
+	// If there is anything that would interfere with our use of aligned
+	// vector loads/stores, call the reference implementation.
+	if ( incx != 1 || incy != 1 )
+	{
+		use_ref = TRUE;
+	}
+	else if ( bli_is_unaligned_to( x, 16 ) ||
+	          bli_is_unaligned_to( y, 16 ) )
+	{
+		use_ref = TRUE;
+
+		if ( bli_is_unaligned_to( x, 16 ) &&
+		     bli_is_unaligned_to( y, 16 ) )
+		{
+			use_ref = FALSE;
+			n_pre = 1;
+		}
+	}
+
+	// Call the reference implementation if needed.
+	if ( use_ref == TRUE )
+	{
+		bli_ddddotv_unb_var1( conjx,
+		                      conjy,
+		                      n,
+		                      x, incx,
+		                      y, incy,
+		                      rho );
+		return;
 	}

 	n_run       = ( n - n_pre ) / 2;
--- a/kernels/x86_64/1f/bli_axpy2v_opt_var1.c
+++ b/kernels/x86_64/1f/bli_axpy2v_opt_var1.c
@@ -166,7 +166,7 @@ void bli_dddaxpy2v_opt_var1(
                            void*  beta,
                            void*  x, inc_t incx,
                            void*  y, inc_t incy,
-                            void*  z,  inc_t incz
+                            void*  z, inc_t incz
                          )
 {
 	double*  restrict alpha_cast  = alpha;
@@ -192,20 +192,48 @@ void bli_dddaxpy2v_opt_var1(
 	v2df_t            x1v, y1v, z1v;
 	v2df_t            x2v, y2v, z2v;

+	bool_t            use_ref = FALSE;
+
+
 	if ( bli_zero_dim1( n ) ) return;

-	if ( incx != 1 ||
-	     incy != 1 ||
-		 incz != 1 ) bli_abort();
-
 	n_pre = 0;
-	if ( ( unsigned long ) x % 16 != 0 )
-	{
-		if ( ( unsigned long ) y % 16 == 0 ||
-		     ( unsigned long ) z % 16 == 0 ) bli_abort();

-		n_pre = 1;
+	// If there is anything that would interfere with our use of aligned
+	// vector loads/stores, call the reference implementation.
+	if ( incx != 1 || incy != 1 || incz != 1 )
+	{
+		use_ref = TRUE;
 	}
+	else if ( bli_is_unaligned_to( x, 16 ) ||
+	          bli_is_unaligned_to( y, 16 ) ||
+	          bli_is_unaligned_to( z, 16 ) )
+	{
+		use_ref = TRUE;
+
+		if ( bli_is_unaligned_to( x, 16 ) &&
+		     bli_is_unaligned_to( y, 16 ) &&
+		     bli_is_unaligned_to( z, 16 ) )
+		{
+			use_ref = FALSE;
+			n_pre   = 1;
+		}
+	}
+
+	// Call the reference implementation if needed.
+	if ( use_ref == TRUE )
+	{
+		bli_dddaxpy2v_unb_var1( conjx,
+		                        conjy,
+		                        n,
+		                        alpha,
+		                        beta,
+		                        x, incx,
+		                        y, incy,
+		                        z, incz );
+		return;
+	}
+

 	n_run       = ( n - n_pre ) / ( n_elem_per_reg * n_iter_unroll );
 	n_left      = ( n - n_pre ) % ( n_elem_per_reg * n_iter_unroll );
--- a/kernels/x86_64/1f/bli_axpyf_opt_var1.c
+++ b/kernels/x86_64/1f/bli_axpyf_opt_var1.c
@@ -199,9 +199,40 @@ void bli_dddaxpyf_opt_var1(
 	v2df_t            a10v, a11v, a12v, a13v, y1v;
 	v2df_t            chi0v, chi1v, chi2v, chi3v;

+	bool_t            use_ref = FALSE;
+
+
 	if ( bli_zero_dim2( m, b_n ) ) return;

+	m_pre = 0;
+
+	// If there is anything that would interfere with our use of aligned
+	// vector loads/stores, call the reference implementation.
 	if ( b_n < PASTEMAC(d,axpyf_fuse_fac) )
+	{
+		use_ref = TRUE;
+	}
+	else if ( inca != 1 || incx != 1 || incy != 1 )
+	{
+		use_ref = TRUE;
+	}
+	else if ( bli_is_unaligned_to( a, 16 ) ||
+	          bli_is_unaligned_to( x, 16 ) ||
+	          bli_is_unaligned_to( y, 16 ) )
+	{
+		use_ref = TRUE;
+
+		if ( bli_is_unaligned_to( a, 16 ) &&
+		     bli_is_unaligned_to( x, 16 ) &&
+		     bli_is_unaligned_to( y, 16 ) )
+		{
+			use_ref = FALSE;
+			m_pre   = 1;
+		}
+	}
+
+	// Call the reference implementation if needed.
+	if ( use_ref == TRUE )
 	{
 		PASTEMAC3(d,d,d,axpyf_unb_var1)( conja,
 		                                 conjx,
@@ -214,18 +245,6 @@ void bli_dddaxpyf_opt_var1(
 		return;
 	}

-	if ( inca != 1 ||
-	     incx != 1 ||
-	     incy != 1 ) bli_abort();
-
-	m_pre = 0;
-	if ( ( unsigned long ) a % 16 != 0 )
-	{
-		if ( ( unsigned long ) x % 16 == 0 ||
-		     ( unsigned long ) y % 16 == 0 ) bli_abort();
-
-		m_pre = 1;
-	}

 	m_run       = ( m - m_pre ) / ( n_elem_per_reg * n_iter_unroll );
 	m_left      = ( m - m_pre ) % ( n_elem_per_reg * n_iter_unroll );
--- a/kernels/x86_64/1f/bli_dotaxpyv_opt_var1.c
+++ b/kernels/x86_64/1f/bli_dotaxpyv_opt_var1.c
@@ -128,23 +128,56 @@ void bli_ddddotaxpyv_opt_var1(
 	dim_t             i;
 	inc_t             stepx, stepy, stepz;

-	v2df_t    alphav, rhov;
-	v2df_t    x1v, y1v, z1v;
+	v2df_t            alphav, rhov;
+	v2df_t            x1v, y1v, z1v;

+	bool_t            use_ref = FALSE;
+
+	// If the vector lengths are zero, set rho to zero and return.
 	if ( bli_zero_dim1( n ) )
 	{
 		PASTEMAC(d,set0s)( *rho_cast );
 		return;
 	}

-   n_pre = 0;
-	if ( ( unsigned long ) x % 16 != 0 )
-	{
-		if ( ( unsigned long ) y % 16 == 0 ||
-		     ( unsigned long ) z % 16 == 0 ) bli_abort();
+	n_pre = 0;

-		n_pre = 1;
+	// If there is anything that would interfere with our use of aligned
+	// vector loads/stores, call the reference implementation.
+	if ( incx != 1 || incy != 1 || incz != 1 )
+	{
+		use_ref = TRUE;
 	}
+	else if ( bli_is_unaligned_to( x, 16 ) ||
+	          bli_is_unaligned_to( y, 16 ) ||
+	          bli_is_unaligned_to( z, 16 ) )
+	{
+		use_ref = TRUE;
+
+		if ( bli_is_unaligned_to( x, 16 ) &&
+		     bli_is_unaligned_to( y, 16 ) &&
+		     bli_is_unaligned_to( z, 16 ) )
+		{
+			use_ref = FALSE;
+			n_pre   = 1;
+		}
+	}
+
+	// Call the reference implementation if needed.
+	if ( use_ref == TRUE )
+	{
+		bli_ddddotaxpyv_unb_var1( conjxt,
+		                          conjx,
+		                          conjy,
+		                          n,
+		                          alpha,
+		                          x, incx,
+		                          y, incy,
+		                          rho,
+		                          z, incz );
+		return;
+	}
+

 	n_run       = ( n - n_pre ) / ( 2 * 1 );
 	n_left      = ( n - n_pre ) % ( 2 * 1 );
--- a/kernels/x86_64/1f/bli_dotxaxpyf_opt_var1.c
+++ b/kernels/x86_64/1f/bli_dotxaxpyf_opt_var1.c
@@ -164,8 +164,12 @@ void bli_ddddotxaxpyf_opt_var1(
 	v2df_t            w2v, z2v;
 	v2df_t            psi0v, psi1v, betav, alphav;

+	bool_t            use_ref = FALSE;
+
+
 	if ( bli_zero_dim1( b_n ) ) return;

+	// If the vector lengths are zero, scale y by beta and return.
 	if ( bli_zero_dim1( m ) ) 
 	{ 
 		PASTEMAC2(d,d,scalv)( BLIS_NO_CONJUGATE,
@@ -175,7 +179,38 @@ void bli_ddddotxaxpyf_opt_var1(
 		return; 
 	} 

+	m_pre = 0;
+
+	// If there is anything that would interfere with our use of aligned
+	// vector loads/stores, call the reference implementation.
 	if ( b_n < PASTEMAC(d,dotxaxpyf_fuse_fac) )
+	{
+		use_ref = TRUE;
+	}
+	else if ( inca != 1 || incw != 1 || incx != 1 || incy != 1 || incz != 1 )
+	{
+		use_ref = TRUE;
+	}
+	else if ( bli_is_unaligned_to( a, 16 ) ||
+	          bli_is_unaligned_to( w, 16 ) ||
+	          bli_is_unaligned_to( x, 16 ) ||
+	          bli_is_unaligned_to( y, 16 ) ||
+	          bli_is_unaligned_to( z, 16 ) )
+	{
+		use_ref = TRUE; // Mixed alignment: fall back unless all five operands are unaligned.
+
+		if ( bli_is_unaligned_to( a, 16 ) &&
+		     bli_is_unaligned_to( w, 16 ) &&
+		     bli_is_unaligned_to( x, 16 ) &&
+		     bli_is_unaligned_to( y, 16 ) &&
+		     bli_is_unaligned_to( z, 16 ) )
+		{
+			use_ref = FALSE;
+			m_pre   = 1;
+		}
+	}
+
+	if ( use_ref == TRUE ) // Call the reference implementation if needed.
 	{
 		PASTEMAC3(d,d,d,dotxaxpyf_unb_var1)( conjat,
 		                                     conja,
@@ -194,24 +229,6 @@ void bli_ddddotxaxpyf_opt_var1(
 	}


-	if ( inca != 1 ||
-	     incw != 1 ||
-	     incx != 1 ||
-	     incy != 1 ||
-	     incz != 1 ) bli_abort();
-
-	m_pre = 0;
-	if ( ( unsigned long ) a % 16 != 0 )
-	{
-		if ( ( unsigned long ) w % 16 == 0 ||
-		     ( unsigned long ) x % 16 == 0 ||
-		     ( unsigned long ) y % 16 == 0 ||
-		     ( unsigned long ) z % 16 == 0 )
-			bli_abort();
-
-		m_pre = 1;
-	}
-
 	m_run       = ( m - m_pre ) / ( n_elem_per_reg * n_iter_unroll );
 	m_left      = ( m - m_pre ) % ( n_elem_per_reg * n_iter_unroll );

--- a/kernels/x86_64/1f/bli_dotxf_opt_var1.c
+++ b/kernels/x86_64/1f/bli_dotxf_opt_var1.c
@@ -220,18 +220,50 @@ void bli_ddddotxf_opt_var1(
 	v2df_t            rho0v, rho1v, rho2v, rho3v;
 	v2df_t            x0v, x1v, x2v, x3v, y0v, betav, alphav;

+	bool_t            use_ref = FALSE;
+
+
 	if ( bli_zero_dim1( b_m ) ) return;

+	// If the vector lengths are zero, scale r by beta and return.
 	if ( bli_zero_dim1( n ) ) 
 	{ 
-		PASTEMAC(d,scals)( *beta_cast, *(r_cast  ) ); 
-		PASTEMAC(d,scals)( *beta_cast, *(r_cast+1) ); 
-		PASTEMAC(d,scals)( *beta_cast, *(r_cast+2) ); 
-		PASTEMAC(d,scals)( *beta_cast, *(r_cast+3) ); 
+		PASTEMAC2(d,d,scalv)( BLIS_NO_CONJUGATE,
+		                      b_m,
+		                      beta_cast,
+		                      r_cast, incr );
 		return; 
 	} 

+    n_pre = 0;
+
+    // If there is anything that would interfere with our use of aligned
+    // vector loads/stores, call the reference implementation.
 	if ( b_m < PASTEMAC(d,dotxf_fuse_fac) )
+	{
+		use_ref = TRUE;
+	}
+    else if ( incx != 1 || incy != 1 || incr != 1 )
+    {
+        use_ref = TRUE;
+    }
+	else if ( bli_is_unaligned_to( x, 16 ) ||
+	          bli_is_unaligned_to( y, 16 ) ||
+	          bli_is_unaligned_to( r, 16 ) )
+	{
+		use_ref = TRUE;
+
+		if ( bli_is_unaligned_to( x, 16 ) &&
+		     bli_is_unaligned_to( y, 16 ) &&
+		     bli_is_aligned_to( r, 16 ) ) // Note: r is not affected by x and y being unaligned. 
+		{
+			use_ref = FALSE;
+			n_pre   = 1;
+		}
+	}
+
+	// Call the reference implementation if needed.
+	if ( use_ref == TRUE )
 	{
 		PASTEMAC3(d,d,d,dotxf_unb_var1)( conjx,
 		                                 conjy,
@@ -246,18 +278,6 @@ void bli_ddddotxf_opt_var1(
 	}


-	if ( incx != 1 ||
-	     incy != 1 ) bli_abort();
-
-	n_pre = 0;
-	if ( ( unsigned long ) y % 16 != 0 )
-	{
-		if ( ( unsigned long ) x % 16 == 0 )
-			bli_abort();
-
-		n_pre = 1;
-	}
-
 	n_run       = ( n - n_pre ) / ( n_elem_per_reg * n_iter_unroll );
 	n_left      = ( n - n_pre ) % ( n_elem_per_reg * n_iter_unroll );

--- a/test/Makefile
+++ b/test/Makefile
@@ -134,7 +134,7 @@ BLAS_LIB_PATH  := $(HOME)/flame/lib
 OPENBLAS_LIB   := $(BLAS_LIB_PATH)/libopenblas.a
 ATLAS_LIB      := $(BLAS_LIB_PATH)/libf77blas.a \
                  $(BLAS_LIB_PATH)/libatlas.a
-#MKL_LIB        := -L/opt/intel/mkl/10.2.2.025/lib/em64t/ \
+MKL_LIB        := -L$(HOME)/intel/mkl/lib/intel64/ \
                  -lmkl_sequential -lmkl_core -lmkl_intel_lp64


--- a/test/runme.sh
+++ b/test/runme.sh
@@ -1,11 +1,12 @@
 #!/bin/bash

 exec_root="test"
-out_root="output_sqaure"
+out_root="output"
+#out_root="output_square"

 # Operations to test.
-#l2_ops="gemv ger hemv her her2 trmv trsv"
-l3_ops="gemm" #"gemm hemm herk her2k trmm trsm"
+l2_ops="gemv ger hemv her her2 trmv trsv"
+l3_ops="gemm hemm herk her2k trmm trsm"
 test_ops="${l2_ops} ${l3_ops}"

 # Implementations to test
--- a/test/test_gemm.c
+++ b/test/test_gemm.c
@@ -89,7 +89,7 @@ int main( int argc, char** argv )

 	m_input = -1;
 	n_input = -1;
-	k_input = 200;
+	k_input = -1;
 	//k_input = 200;
 #else
 	p_begin = 16;
--- a/test/test_her2k.c
+++ b/test/test_her2k.c
@@ -86,7 +86,8 @@ int main( int argc, char** argv )
 	p_inc   = 40;

 	m_input = -1;
-	k_input = 200;
+	k_input = -1;
+	//k_input = 200;
 #else
 	p_begin = 16;
 	p_end   = 16;
--- a/test/test_herk.c
+++ b/test/test_herk.c
@@ -86,7 +86,8 @@ int main( int argc, char** argv )
 	p_inc   = 40;

 	m_input = -1;
-	k_input = 200;
+	k_input = -1;
+	//k_input = 200;
 #else
 	p_begin = 16;
 	p_end   = 16;
--- a/testsuite/input.general
+++ b/testsuite/input.general
@@ -3,9 +3,9 @@ c #rg    # Matrix storage scheme(s) to test ('c' = col-major; 'r' = row-major; '
 c #rji   # Vector storage scheme(s) to test ('c' = colvec/unit; 'r' = rowvec/unit; 'j' = colvec/non-unit; 'i' = rowvec/non-unit)
 1       # Test all combinations of storage schemes?
 32      # General stride spacing (for cases when testing general stride)
-sdcz    # Datatype(s) to test
+d #sdcz    # Datatype(s) to test
 100     # Problem size: first to test
-400     # Problem size: maximum to test
+500      # Problem size: maximum to test
 100     # Problem size: increment between experiments
 1       # Error-checking level (0 = disable error checking; 1 = full error checking)
 i       # Reaction to test failure ('i' = ignore; 's' = sleep() and continue; 'a' = abort)