Merge branch 'amd' into rt

Details: - Merged contributions made by AMD via 'amd' branch (see summary below). Special thanks to AMD for their contributions to date, especially with regard to intrinsic- and assembly-based kernels. - Added column storage output cases to microkernels in bli_gemm_zen_asm_d6x8.c and bli_gemmtrsm_l_zen_asm_d6x8.c. Even with the extra cost of transposing the microtile in registers, this is much faster than using the general storage case when the underlying matrix is column-stored. - Added s and d assembly-based zen gemmtrsm_u microkernel (including column storage optimization mentioned above). - Updated zen sub-configuration to reflect presence of new native kernels. - Temporarily reverted zen sub-configuration's level-3 cache blocksizes to smaller haswell values. - Temporarily disabled small matrix handling for zen configuration family in config/zen/bli_family_zen.h. - Updated zen CFLAGS according to changes in 1e4365b. - Updated haswell microkernels such that: - only one vzeroupper instruction is called prior to returning - movapd/movupd are used in lieu of movaps/movups for double-real microkernels. (Note that single-real microkernels still use movaps/movups.) - Added kernel prototypes to kernels/zen/bli_kernels_zen.h, which is now included via frame/include/bli_arch_config.h. - Minor updates to bli_amaxv_ref.c (and to inlined "test" implementation in testsuite/src/test_amaxv.c). - Added early return for alpha == 0 in bli_dotxv_ref.c. - Integrated changes from f07b176, including a fix for undefined behavior when executing the 1m method under certain conditions. - Updated config_registry; no longer need haswell kernels for zen sub-configuration. - Tweaked marginal and pass thresholds for dotxf. - Reformatted level-1v, -1f, and -3 amd kernels and inserted additional comments. - Updated LICENSE file to explicitly mention that parts are copyright UT-Austin and AMD. - Added AMD copyright to header templates in build/templates. 
Summary of previous changes from 'amd' branch. - Added s and d assembly-based zen gemm microkernels (d6x8 and d8x6) and s and d assembly-based zen gemmtrsm_l microkernels (d6x8). - Added s and d intrinsics-based zen kernels for amaxv, axpyv, dotv, dotxv, and scalv, with extra-unrolling variants for axpyv and scalv. - Added a small matrix handler to bli_gemm_front(), with the handler implemented in kernels/zen/3/bli_gemm_small_matrix.c. - Added additional logic to sumsqv that first attempts to compute the sum of the squares via dotv(). If there is a floating-point exception (FE_OVERFLOW), then the previous (numerically conservative) code is used; otherwise, the result of dotv() is square-rooted and stored as the result. This new implementation is only enabled when FE_OVERFLOW is #defined. If the macro is not #defined, then the previous implementation is used. - Added axpyv and dotv standalone test drivers to test directory. - Added zen support to old cpuid_x86.c driver in build/auto-detect/old. - Added thread-local and __attribute__-related macros to bli_macro_defs.h.
2026-05-11 01:30:00 +00:00 · 2018-02-21 17:43:32 -06:00
parent fa74af4e1f 5a7005dd44
commit 16813335bd
46 changed files with 17161 additions and 255 deletions
--- a/frame/util/bli_util_unb_var1.c
+++ b/frame/util/bli_util_unb_var1.c
@@ -5,6 +5,7 @@
   libraries.

   Copyright (C) 2014, The University of Texas at Austin
+   Copyright (C) 2017, Advanced Micro Devices, Inc.

   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions are
@@ -33,6 +34,7 @@
 */

 #include "blis.h"
+#include <fenv.h>

 //
 // Define BLAS-like interfaces with typed operands.
@@ -293,7 +295,129 @@ void PASTEMAC(ch,varname) \
 	PASTEMAC(chr,copys)( sqrt_sumsq, *norm ); \
 }

-INSERT_GENTFUNCR_BASIC( normfv_unb_var1, sumsqv_unb_var1 )
+// Was INSERT_GENTFUNCR_BASIC( normfv_unb_var1, sumsqv_unb_var1 ); now only c and z are instantiated here, s and d further below.
+GENTFUNCR( scomplex, float,  c, s, normfv_unb_var1, sumsqv_unb_var1 )
+GENTFUNCR( dcomplex, double, z, d, normfv_unb_var1, sumsqv_unb_var1 )
+
+#undef  GENTFUNCR
+#ifdef FE_OVERFLOW
+#define GENTFUNCR( ctype, ctype_r, ch, chr, varname, kername ) \
+/* Writes to *norm the square root of the sum of squares of x: dotv fast path first, kername otherwise. */ \
+void PASTEMAC(ch,varname) \
+     ( \
+       dim_t    n, \
+       ctype*   x, inc_t incx, \
+       ctype_r* norm, \
+       cntx_t*  cntx  \
+     ) \
+{ \
+	ctype_r* zero       = PASTEMAC(chr,0); \
+	ctype_r* one        = PASTEMAC(chr,1); \
+	ctype_r  scale; \
+	ctype_r  sumsq; \
+	ctype_r  sqrt_sumsq; \
+\
+	/* Seed the accumulators used by the fallback path: scale = *zero, sumsq = *one. */ \
+	PASTEMAC(chr,copys)( *zero, scale ); \
+	PASTEMAC(chr,copys)( *one,  sumsq ); \
+\
+	/* An optimization: first try to use dotv to compute the sum of
+	   the squares of the vector. If no floating-point exceptions
+	   (specifically, overflow and invalid exceptions) were produced,
+	   then we accept the computed value and return early. The cost
+	   of this optimization is the "sunk" cost of the initial dotv
+	   when sumsqv must be used instead. However, we expect that the
+	   vast majority of use cases will not produce exceptions, and
+	   therefore only one pass through the data, via dotv, will be
+	   required. */ \
+	if ( TRUE ) /* always taken; presumably a manual switch for the fast path -- confirm */ \
+	{ \
+		int      f_exp_raised;\
+		ctype    sumsqc; \
+\
+		feclearexcept( FE_ALL_EXCEPT ); /* NOTE(review): also clears flags raised before this call. */\
+\
+		PASTEMAC(ch,dotv)\
+		( \
+		  BLIS_NO_CONJUGATE, \
+		  BLIS_NO_CONJUGATE, \
+		  n,\
+		  x, incx, \
+		  x, incx, \
+		  &sumsqc, \
+		  cntx  \
+		); \
+\
+		PASTEMAC2(ch,chr,copys)( sumsqc, sumsq ); \
+\
+		f_exp_raised = fetestexcept( FE_OVERFLOW | FE_INVALID ); /* NOTE(review): FE_UNDERFLOW is not tested. */\
+\
+		if ( !f_exp_raised ) \
+		{ \
+		    PASTEMAC(chr,sqrt2s)( sumsq, *norm ); \
+		    return; \
+		} \
+	} \
+\
+	/* Fallback: an exception was flagged, so recompute via kername using scale and sumsq. */ \
+	PASTEMAC(ch,kername) \
+	( \
+	  n, \
+	  x, incx, \
+	  &scale, \
+	  &sumsq, \
+	  cntx  \
+	); \
+\
+	/* Compute: norm = scale * sqrt( sumsq ) */ \
+	PASTEMAC(chr,sqrt2s)( sumsq, sqrt_sumsq ); \
+	PASTEMAC(chr,scals)( scale, sqrt_sumsq ); \
+\
+	/* Store the final value to the output variable. */ \
+	PASTEMAC(chr,copys)( sqrt_sumsq, *norm ); \
+}
+#else
+#define GENTFUNCR( ctype, ctype_r, ch, chr, varname, kername ) \
+/* Variant used when FE_OVERFLOW is not defined: *norm = scale * sqrt( sumsq ), both produced by kername. */ \
+void PASTEMAC(ch,varname) \
+     ( \
+       dim_t    n, \
+       ctype*   x, inc_t incx, \
+       ctype_r* norm, \
+       cntx_t*  cntx  \
+     ) \
+{ \
+	ctype_r* zero       = PASTEMAC(chr,0); \
+	ctype_r* one        = PASTEMAC(chr,1); \
+	ctype_r  scale; \
+	ctype_r  sumsq; \
+	ctype_r  sqrt_sumsq; \
+\
+	/* Seed the accumulators before the summation: scale = *zero, sumsq = *one. */ \
+	PASTEMAC(chr,copys)( *zero, scale ); \
+	PASTEMAC(chr,copys)( *one,  sumsq ); \
+\
+	/* Compute the sum of the squares of the vector; kername updates scale and sumsq in place. */ \
+\
+	PASTEMAC(ch,kername) \
+	( \
+	  n, \
+	  x, incx, \
+	  &scale, \
+	  &sumsq, \
+	  cntx  \
+	); \
+\
+	/* Compute: norm = scale * sqrt( sumsq ) */ \
+	PASTEMAC(chr,sqrt2s)( sumsq, sqrt_sumsq ); \
+	PASTEMAC(chr,scals)( scale, sqrt_sumsq ); \
+\
+	/* Store the final value to the output variable. */ \
+	PASTEMAC(chr,copys)( sqrt_sumsq, *norm ); \
+}
+#endif
+GENTFUNCR( float,   float,  s, s, normfv_unb_var1, sumsqv_unb_var1 )
+GENTFUNCR( double,  double, d, d, normfv_unb_var1, sumsqv_unb_var1 )


 #undef  GENTFUNCR
@@ -898,7 +1022,7 @@ void PASTEMAC(ch,varname) \
 				  n_elem - 1, \
 				  &beta, \
 				  x0, incx, \
-                  cntx  \
+				  cntx  \
 				); \
 */ \
 			} \
@@ -937,7 +1061,7 @@ void PASTEMAC(ch,varname) \
 				  n_elem - 1, \
 				  &beta, \
 				  x2, incx, \
-                  cntx  \
+				  cntx  \
 				); \
 */ \
 			} \